# Logistic Regression

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


## Load cleaned data and document vectors from the `dataprocessing` notebook

In [2]:
# Read the cleaned dataset from CSV (presumably written by the
# `dataprocessing` notebook — verify).
df = pd.read_csv(filepath_or_buffer='cleaned_data.csv')

# Last expression of the cell: display the available column names.
df.columns


Index(['id', 'comment_text', 'split', 'created_date', 'publication_id',
       'parent_id', 'article_id', 'rating', 'funny', 'wow', 'sad', 'likes',
       'disagree', 'toxicity', 'severe_toxicity', 'obscene', 'sexual_explicit',
       'identity_attack', 'insult', 'threat', 'male', 'female', 'transgender',
       'other_gender', 'heterosexual', 'homosexual_gay_or_lesbian', 'bisexual',
       'other_sexual_orientation', 'christian', 'jewish', 'muslim', 'hindu',
       'buddhist', 'atheist', 'other_religion', 'black', 'white', 'asian',
       'latino', 'other_race_or_ethnicity', 'physical_disability',
       'intellectual_or_learning_disability', 'psychiatric_or_mental_illness',
       'other_disability', 'identity_annotator_count',
       'toxicity_annotator_count', 'cleaned_text', 'is_toxic'],
      dtype='object')

In [7]:
import pickle

# Load the pre-computed document vectors that serve as the feature matrix X.
# No placeholder assignment is needed: if the file cannot be read, the cell
# fails here instead of leaving a bogus empty-string X behind for later cells.
#
# NOTE(security): pickle.load can execute arbitrary code while unpickling.
# Only load this file if it comes from a trusted source (presumably the
# `dataprocessing` notebook — verify).
with open('LR_document_vectors.pickle', 'rb') as handle:
    X = pickle.load(handle)

In [8]:
# Labels come from the `is_toxic` column of the loaded dataframe; the features
# are the document vectors already held in X.
# (Alternative left as a note: threshold `toxicity` at 0.5 with np.where.)
y = df['is_toxic']

# Print both lengths side by side to confirm features and labels line up.
print(*(len(part) for part in (X, y)))


857934 857934


## Train Test Split

In [9]:
import numpy as np
from sklearn.model_selection import train_test_split

# Stage 1: keep 80% of the samples for training and set 20% aside.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Stage 2: cut the held-out 20% in half, giving validation and test splits
# of 10% of the full data each. The fixed seed keeps both splits repeatable.
X_validation, X_test, y_validation, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
)


## Train Logistic Regression

In [10]:
# Fit a logistic-regression classifier with scikit-learn's default settings on
# the training split produced earlier. `fit` returns the estimator itself, so
# construction and training are chained.
logreg_classifier = LogisticRegression().fit(X_train, y_train)

# Hard class predictions for the held-out test split.
y_pred = logreg_classifier.predict(X_test)

# Fraction of test labels that were predicted correctly.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.6669114390283702


In [12]:
# Spot-check the first 30 test samples: predicted probability, true label and
# predicted label.

# Column 1 of predict_proba is the second entry of `classes_` (label 1 for
# 0/1 labels). Slicing X_test *before* scoring avoids computing probabilities
# for the whole test split just to keep 30 rows.
predictions_proba = logreg_classifier.predict_proba(X_test[:30])[:, 1]
print(predictions_proba)

# y_test is a pandas Series that still carries the shuffled original integer
# index, so use .iloc to make the "first 30 rows by position" intent explicit
# rather than relying on implicit positional slicing.
print(y_test.iloc[:30])
print(y_pred[:30])

[0.74905141 0.58741639 0.28185649 0.20795242 0.05247452 0.52864872
 0.61905166 0.54465442 0.7304845  0.45306829 0.33209656 0.20506908
 0.86102194 0.49922397 0.28777097 0.7260824  0.36289393 0.30558214
 0.37011201 0.43164269 0.0758166  0.34006381 0.35297228 0.47274113
 0.0382508  0.70498946 0.78529529 0.61375058 0.36547252 0.77546721]
523432    1
721108    1
195647    0
418740    0
725922    1
209455    1
679707    1
647453    1
435284    0
804960    0
341759    0
809882    1
707604    1
742187    1
186174    1
416267    1
455788    0
264423    0
809465    0
731031    1
740538    1
166709    1
645939    1
468541    0
469709    0
614761    1
772259    1
439648    1
60366     0
764883    1
Name: is_toxic, dtype: int64
[1 1 0 0 0 1 1 1 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 1 1 0 1]


## Save and reload model

In [13]:
import joblib

# Persist the fitted classifier to disk. As the last expression of the cell,
# the list of written file names returned by joblib.dump is displayed.
joblib.dump(value=logreg_classifier, filename="LR_model.pkl")


['LR_model.pkl']

In [15]:
# Read the persisted classifier back from disk under a separate name.
clf2 = joblib.load(filename="LR_model.pkl")

# Sanity check: the reloaded model predicts on the first test sample, passed
# as a one-row slice so the input stays two-dimensional.
clf2.predict(X_test[:1])

array([1], dtype=int64)