<a href="https://colab.research.google.com/github/worldwidekatie/solnechnyy-svet/blob/master/solnechnyy_svet__categorizing_test2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import sklearn as sk
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.feature_selection import SelectPercentile, f_classif

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/worldwidekatie/Build_Week_2/master/ira_cleaned_data.csv')
df = df.copy()
df = df[['content', 'target']]
df.head()

Unnamed: 0,content,target
0,#adee RT davis1988will: Congratulations for Ma...,1.0
1,RT SSOL getting attention. It's penny play day...,1.0
2,#laup SHOCK VIDEO : Antifa Thugs Break a Latin...,1.0
3,PROOF Melania Has Done FAR MORE for Disaster R...,1.0
4,"An USC professor, Raphael Bostic, named first ...",1.0


I'm estimating with a very high estimate that .00005% of tweets are IRA tweets. It's probably less than that.

My sample has 5% IRA tweets, so it's a bit oversampled.

In [3]:
df.target.value_counts(normalize=True)

0.0    0.947465
1.0    0.052535
Name: target, dtype: float64

In [4]:
train, val = train_test_split(df, random_state=42)
print(train.shape, val.shape)

(151984, 2) (50662, 2)


In [5]:
train, test = train_test_split(train, random_state=42)
print(train.shape, test.shape)

(113988, 2) (37996, 2)


In [0]:
target = 'target'
features = 'content'

X_train = train[features]
y_train = train[target]
X_val = val[features]
y_val = val[target]
X_test = test[features]
y_test = test[target]

#The pipeline below is the highest recall I could get 88.9%! Woot!

In [11]:
pipeline = make_pipeline(
    TfidfVectorizer(),
    SelectPercentile(f_classif, percentile=10),
    PassiveAggressiveClassifier(max_iter=60)
)
param_distributions = { 
    #'selectpercentile__percentile':[5,10,30,40,60,90],
    'passiveaggressiveclassifier__class_weight':[{1:0.52808406, 0:19.40184758}, 
                                                 {1:0.52808406, 0:9.40184758},
                                                 'balanced', None],
    #'passiveaggressiveclassifier__max_iter':[10,20,30,40,50,60,70,100,200,500,None]
}


search = RandomizedSearchCV(
    pipeline, 
    param_distributions=param_distributions, 
    n_iter=10, #Changed from 10
    cv=5, #Changed from 5
    scoring='recall', 
    verbose=1, 
    return_train_score=True
)

search.fit(X_train, y_train);

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:  1.1min finished


In [12]:
print('Best hyperparameters', search.best_params_)
print('Cross-validation Accuracy', -search.best_score_)
pipeline = search.best_estimator_
print("Train Accuracy:", pipeline.score(X_train, y_train))
print("Validation Accuracy:", pipeline.score(X_val, y_val))

Best hyperparameters {'passiveaggressiveclassifier__class_weight': 'balanced'}
Cross-validation Accuracy -0.8812293389272975
Train Accuracy: 0.9838579499596449
Validation Accuracy: 0.9724843077651889


In [13]:
y_pred = pipeline.predict(X_val)
tn, fp, fn, tp = confusion_matrix(y_val, y_pred).ravel()
print("Precision:", tp /(tp+fp))  
print("Recall:", tp/(tp+fn))

Precision: 0.6743853630646084
Recall: 0.9024483550114767


# Beginning to work on feature importances and explainability but still have a very long way to go

In [10]:
vectorizer = pipeline.named_steps['tfidfvectorizer']
features = pd.Series(vectorizer.vocabulary_)
features.head(50)

disappointed      30277
with             111164
the              100901
tt               104763
today            103116
ruined            87923
by                19133
hurricane         47743
headwind          45239
doing             31172
make              64417
up               106858
homework          46808
missing           68593
school            89848
all                8157
time             102544
for               38835
acting             6334
is                50520
getting           41331
little            61689
annoying           9981
homeschooling     46800
would            111789
be                13854
so                94360
much              70857
easier            32989
ouvindo           76912
guns              43681
roses             87473
november          74565
rain              83810
new               73017
proposal          82033
takes             99213
care              20075
of                75369
border            16972
wall             109233
funding         