In [1]:
from textblob import TextBlob, Word
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score

In [2]:
df = pd.read_csv('new_export_dataframe.csv')
df.head()

Unnamed: 0,who,category,sentences
0,Client:,Sexist,em pulling out page of the sun newspaper ...
1,Client:,Sexist,are you some sort of faggot
2,Client,Sexist,i d like you to update the site again as pa...
3,Client:,Sexist,every time i turn a page of the book i m trans...
4,Me:,Sexist,alright since your editors aren t really tec...


In [3]:
df['deadbeats'] = np.where((df['category'] == 'Deadbeats'), 1, 0)

In [4]:
df.head()

Unnamed: 0,who,category,sentences,deadbeats
0,Client:,Sexist,em pulling out page of the sun newspaper ...,0
1,Client:,Sexist,are you some sort of faggot,0
2,Client,Sexist,i d like you to update the site again as pa...,0
3,Client:,Sexist,every time i turn a page of the book i m trans...,0
4,Me:,Sexist,alright since your editors aren t really tec...,0


In [5]:
from nltk.corpus import stopwords
en_stopwords = stopwords.words("english")

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
word_count_vectors = tfidf.fit_transform(
    df['sentences'].apply(
        lambda x: ' '.join([Word(w).lemmatize() for w in x.split() if w not in en_stopwords])
    ).values).todense().tolist()

In [7]:
df.head()

Unnamed: 0,who,category,sentences,deadbeats
0,Client:,Sexist,em pulling out page of the sun newspaper ...,0
1,Client:,Sexist,are you some sort of faggot,0
2,Client,Sexist,i d like you to update the site again as pa...,0
3,Client:,Sexist,every time i turn a page of the book i m trans...,0
4,Me:,Sexist,alright since your editors aren t really tec...,0


In [8]:
deadbeats = pd.DataFrame(
    data = word_count_vectors,
    columns = tfidf.get_feature_names()
)
deadbeats

Unnamed: 0,abandoned,ability,able,absolute,absolutely,accent,accept,acceptable,accepted,accepts,...,yet,young,youre,youtube,youve,yr,yyyy,zip,zone,zoomed
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
features = deadbeats

In [10]:
target = df['deadbeats']

In [11]:
target.value_counts()

0    3342
1     871
Name: deadbeats, dtype: int64

In [12]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier


classifiers = {'Logistic Regression': LogisticRegression(random_state=0, solver='lbfgs', multi_class='ovr'),
              'Gaussian NB': GaussianNB(),
              'Bernoulli NB': BernoulliNB(),
              'Random Forest': RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0),
              'KNeighborsClassifier': KNeighborsClassifier(), 
              'Linear SVC': LinearSVC(),
              'Neural Network':  MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)}
              
def compare_models(x, y, model_dict, folds=3):
    results = []
    for name, model in model_dict.items():
        scores = cross_val_score(model, x, y, cv=folds)
        stats = [name, scores.mean(), min(scores), max(scores), scores.std(), pd.Series(scores).mad(), scores]
        results.append(stats)
    df = pd.DataFrame(results, columns = ['Model', 'Mean', 'Min', 'Max','Std', 'Mad', 'Score'])
    df = df.sort_values('Mean', ascending = False)
    return df

In [13]:
compare_models(features, target, classifiers, folds=3)

Unnamed: 0,Model,Mean,Min,Max,Std,Mad,Score
0,Logistic Regression,0.802752,0.79416,0.807829,0.006109,0.005728,"[0.8078291814946619, 0.7941595441595442, 0.806..."
3,Random Forest,0.793259,0.792883,0.793447,0.000266,0.000251,"[0.79288256227758, 0.7934472934472935, 0.79344..."
5,Linear SVC,0.778064,0.75,0.79573,0.020065,0.018709,"[0.7957295373665481, 0.75, 0.7884615384615384]"
2,Bernoulli NB,0.753378,0.726496,0.770107,0.019197,0.017922,"[0.7701067615658364, 0.7264957264957265, 0.763..."
6,Neural Network,0.742466,0.698006,0.79416,0.039587,0.034463,"[0.7352313167259786, 0.698005698005698, 0.7941..."
4,KNeighborsClassifier,0.717301,0.711538,0.728826,0.008149,0.007683,"[0.7288256227758008, 0.7115384615384616, 0.711..."
1,Gaussian NB,0.491339,0.479004,0.502849,0.009752,0.008224,"[0.4790035587188612, 0.5028490028490028, 0.492..."


In [14]:
from sklearn.model_selection import train_test_split

train_X, test_X, train_y, test_y = train_test_split(
    features,
    target,
    test_size = .2,
    random_state = 42
)

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score

log_regr = LogisticRegression(random_state=0, solver='lbfgs', multi_class='ovr')
log_regr.fit(train_X, train_y.values.ravel())
predictions = log_regr.predict(test_X)
actual_values = test_y
print('acurracy: ',accuracy_score(actual_values, predictions)) 
print('precision: ',precision_score(actual_values, predictions)) 
print('recall: ', recall_score(actual_values, predictions))
print('f1_score: ', f1_score(actual_values, predictions))

acurracy:  0.8279952550415184
precision:  0.7741935483870968
recall:  0.14814814814814814
f1_score:  0.2487046632124352


In [16]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators=10).fit(train_X, train_y.values.ravel())

predictions = rfc.predict(test_X)
actual_values = test_y

print('roc_auc_score: ', roc_auc_score(actual_values, predictions) )
print('acurracy: ',accuracy_score(actual_values, predictions)) 
print('precision: ',precision_score(actual_values, predictions)) 
print('recall: ', recall_score(actual_values, predictions))
print('f1_score: ', f1_score(actual_values, predictions))

roc_auc_score:  0.6224506444770762
acurracy:  0.8232502965599051
precision:  0.5783132530120482
recall:  0.2962962962962963
f1_score:  0.39183673469387753


In [17]:
from sklearn.utils import resample

# concatenate our training data back together
X = pd.concat([train_X, train_y], axis=1)

# separate minority and majority classes
not_deadbeats = X[X.deadbeats==0]
deadbeats = X[X.deadbeats==1]

# upsample minority
deadbeats_upsampled = resample(deadbeats,
                          replace=True, # sample with replacement
                          n_samples=len(not_deadbeats), # match number in majority class
                          random_state=27) # reproducible results

# combine majority and upsampled minority
upsampled = pd.concat([not_deadbeats, deadbeats_upsampled])

# check new class counts
upsampled.deadbeats.value_counts()

1    2661
0    2661
Name: deadbeats, dtype: int64

In [18]:
train_X = upsampled.drop('deadbeats', axis=1)
train_y = upsampled['deadbeats']

log_regr = LogisticRegression(random_state=0, solver='lbfgs', multi_class='ovr')
log_regr.fit(train_X, train_y.values.ravel())
predictions = log_regr.predict(test_X)
actual_values = test_y
print('roc_auc_score: ', roc_auc_score(actual_values, predictions) )
print('acurracy: ',accuracy_score(actual_values, predictions)) 
print('precision: ',precision_score(actual_values, predictions)) 
print('recall: ', recall_score(actual_values, predictions))
print('f1_score: ', f1_score(actual_values, predictions))

roc_auc_score:  0.7045059009082505
acurracy:  0.7924080664294187
precision:  0.4666666666666667
recall:  0.5617283950617284
f1_score:  0.5098039215686275


In [19]:
from sklearn.utils import resample

# downsample minority
deadbeats_downsampled = resample(not_deadbeats,
                          replace=False, # sample with replacement
                          n_samples=len(deadbeats), # match number in majority class
                          random_state=27) # reproducible results

# combine majority and upsampled minority
downsampled = pd.concat([deadbeats, deadbeats_downsampled])

# check new class counts
downsampled.deadbeats.value_counts()

1    709
0    709
Name: deadbeats, dtype: int64

In [20]:
train_X = downsampled.drop('deadbeats', axis=1)
train_y = downsampled['deadbeats']

log_regr = LogisticRegression(random_state=0, solver='lbfgs', multi_class='ovr')
log_regr.fit(train_X, train_y.values.ravel())
predictions = log_regr.predict(test_X)
actual_values = test_y
print('roc_auc_score: ', roc_auc_score(actual_values, predictions) )
print('acurracy: ',accuracy_score(actual_values, predictions)) 
print('precision: ',precision_score(actual_values, predictions)) 
print('recall: ', recall_score(actual_values, predictions))
print('f1_score: ', f1_score(actual_values, predictions))

roc_auc_score:  0.6779518137814761
acurracy:  0.7153024911032029
precision:  0.3597122302158273
recall:  0.6172839506172839
f1_score:  0.45454545454545453
