In [2]:
import pandas as pd
import numpy as np
import utility
import re
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import SGDClassifier
import pickle

In [3]:
data =  pd.read_csv('Data/data_no_duplicates.csv', sep=',')
data

Unnamed: 0.1,Unnamed: 0,id,text,misogynous,misogyny_category,target
0,0,1,Please tell me why the bitch next to me in the...,1,dominance,active
1,1,3,"@abzdafab Dear cunt, please shut the fuck up.",1,dominance,active
2,2,4,RT @queenofdragonsb: Pls shut the fuck up bitch,1,dominance,active
3,3,5,"RT @21bIvck: ""when u gonna get your license"" S...",1,dominance,active
4,4,6,@SarahhWaqar @CallmeJaagii Bitch shut the fuck up,1,dominance,active
5,5,7,ok babies i'll go to sleep ok bitch shut the f...,1,dominance,passive
6,6,8,You are a fucking ugly bitch!shut the fuck up?...,1,dominance,active
7,7,9,RT @bnixole: bitch shut the fuck up you're fuc...,1,dominance,active
8,8,10,RT @vaintshit: shut the fuck up and come suck...,1,sexual_harassment,active
9,9,11,@AnisaJomha ice has more talent in his ejecula...,1,sexual_harassment,active


In [4]:
# preprocess text: to lower case, http instead of a full link, @ instead of a full @-mention
utility.text_preprocessing(data, 'text')
# transform emoji to text
utility.demojize(data, 'text')

# Misogyny Detection

## Count Vectorizer + ML Models

In [5]:
# split data into train and test sets
split = StratifiedShuffleSplit(test_size = 0.1, random_state = 42)
for train_index, test_index in split.split(data, data[['misogynous', 'misogyny_category', 'target']]):
    train = data.loc[train_index]
    test = data.loc[test_index]

In [6]:
# convert tweets to a matrix of unigrams and bigrams counts
count_vect_misogyny = CountVectorizer(ngram_range=(1, 2))
X_train = count_vect_misogyny.fit_transform(train['text'])
X_test = count_vect_misogyny.transform(test['text'])

In [6]:
# logistic regression
lg = LogisticRegression(max_iter = 10000)
param_grid = {'C': [0.1, 0.01, 0.001, 0.0001]}
grid_search_lg = GridSearchCV(lg, cv = 10, param_grid = param_grid, scoring = 'accuracy', n_jobs = 3)
grid_search_lg.fit(X_train, train['misogynous'])
print(grid_search_lg.best_estimator_)
print(grid_search_lg.best_score_)

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=10000, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
0.780650542118432


In [7]:
# random forest
forest = RandomForestClassifier()
param_grid = {'max_depth': [155, 165, 175],
             'n_estimators': [200]}
grid_search_forest = GridSearchCV(forest, cv = 10, param_grid = param_grid, scoring = 'accuracy', n_jobs = 3)
grid_search_forest.fit(X_train, train['misogynous'])
print(grid_search_forest.best_estimator_)
print(grid_search_forest.best_score_)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=175, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
0.7867667500695024


In [8]:
# SGD classifier
sgdc = SGDClassifier()
param_grid = {'loss': ['hinge', 'log'],
             'penalty': ['l2', 'l1'],
             'alpha': [0.001, 0.0001, 0.00001]}
grid_search_sgdc = GridSearchCV(sgdc, cv = 10, param_grid = param_grid, scoring = 'accuracy', n_jobs = 3)
grid_search_sgdc.fit(X_train, train['misogynous'])
print(grid_search_sgdc.best_estimator_)
print(grid_search_sgdc.best_score_)

SGDClassifier(alpha=0.001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)
0.780650542118432




In [9]:
# SVM
svm = SVC(kernel = 'rbf', probability = True)
param_grid = {'C': [5],
             'gamma': [0.01]}
grid_search_svm = GridSearchCV(svm, cv = 10, param_grid = param_grid, scoring = 'accuracy', n_jobs = 3)
grid_search_svm.fit(X_train, train['misogynous'])
print(grid_search_svm.best_estimator_)
print(grid_search_svm.best_score_)

SVC(C=5, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.01, kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
0.7892688351403948


In [10]:
# gradient boosting
grad_boost = GradientBoostingClassifier()
param_grid = {'n_estimators': [100],
             'max_depth': [10, 15, 20]}
grid_search_grad_boost = GridSearchCV(grad_boost, cv = 10, param_grid = param_grid, scoring = 'accuracy', n_jobs = 3)
grid_search_grad_boost.fit(X_train, train['misogynous'])
print(grid_search_grad_boost.best_estimator_)
print(grid_search_grad_boost.best_score_)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=10,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)
0.8028912983041423


In [11]:
# ensemble classifier with soft voting
voting = VotingClassifier(estimators=[('forest', grid_search_forest.best_estimator_),
                                      ('svm', grid_search_svm.best_estimator_),
                                      ('boost', grid_search_grad_boost.best_estimator_),
                                     ('sgd', grid_search_sgdc.best_estimator_),
                                     ('lg', grid_search_lg.best_estimator_)])
param_grid = {'voting': ['soft']}
grid_search_voting = GridSearchCV(voting, cv = 10, param_grid = param_grid, scoring = 'accuracy', n_jobs = 3)
grid_search_voting.fit(X_train, train['misogynous'])
print(grid_search_voting.best_estimator_)
print(grid_search_voting.best_score_)

VotingClassifier(estimators=[('forest', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=175, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))],
         flatten_transform=None, n_jobs=1, voting='soft', weights=None)
0.7964970809007507




## Test

In [13]:
y_pred = grid_search_grad_boost.predict(X_test)
print("Accuracy:", accuracy_score(test['misogynous'], y_pred))

Accuracy: 0.82


In [15]:
# save CountVectorizer and the ensemble model
with open('Project Models/BOW_model.pickle', 'wb') as f:
    pickle.dump(grid_search_grad_boost.best_estimator_, f, pickle.HIGHEST_PROTOCOL)
with open('Project Models/Count_vectorizer.pickle', 'wb') as f:
    pickle.dump(count_vect_misogyny, f, pickle.HIGHEST_PROTOCOL)