In [44]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [41]:
df = pd.read_csv('data/vectorized_data.csv')
df.head()

Unnamed: 0,sentence,hate,category,token,vec
0,অসীম সৌন্দর্য বোঝা স্মার্ট বোঝা মান মানুষ রাসু...,0,religion,"['অসীম', 'সৌন্দর্য', 'বোঝা', 'স্মার্ট', 'বোঝা'...",[-0.01497428 0.08873386 0.02271434 0.056933...
1,মাগি মাগি মর যা,1,crime,"['মাগি', 'মাগি', 'মর', 'যা']",[-2.24201421e-02 2.10984946e-01 3.13066845e-...
2,রিফাত মারা,0,crime,"['রিফাত', 'মারা']",[-3.24658379e-02 2.21859217e-01 3.75445386e-...
3,প্রিয় ভক্ত মিজানু রহমান,0,religion,"['প্রিয়', 'ভক্ত', 'মিজানু', 'রহমান']",[-0.01089375 0.10535828 0.01994424 0.063042...
4,তাহা জন সিন্স শালা ফালতু,1,religion,"['তাহা', 'জন', 'সিন্স', 'শালা', 'ফালতু']",[-0.02635221 0.20205662 0.03821389 0.128290...


## Convert the string representations back to NumPy arrays

In [45]:
df['vec'] = df['vec'].apply(lambda x: np.fromstring(x[1:-1], sep=' '))

In [48]:
X = df['vec'].to_list()
y = df['hate'].to_list()

In [50]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [51]:
model_params = {
    'svm': {
        'model': SVC(gamma='auto'),
        'params' : {
            'C': [1,5,10,50],
            'kernel': ['rbf','linear', 'poly']
        }  
    },
    'decision_tree': {
        'model': DecisionTreeClassifier(),
        'params' : {
            'criterion': ['gini', 'entropy']
        }
    },
    'random_forest': {
        'model': RandomForestClassifier(),
        'params' : {
            'n_estimators': [1,5,10,50]
        }
    },
    'logistic_regression' : {
        'model': LogisticRegression(solver='liblinear',multi_class='auto'),
        'params': {
            'C': [1,5,10,50]
        }
    }
}

In [52]:
scores = []

for model_name, mp in model_params.items():
    clf =  GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
    clf.fit(X_train, y_train)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })
    
df = pd.DataFrame(scores,columns=['model','best_score','best_params'])
df

Unnamed: 0,model,best_score,best_params
0,svm,0.779118,"{'C': 50, 'kernel': 'linear'}"
1,decision_tree,0.778757,{'criterion': 'gini'}
2,random_forest,0.773699,{'n_estimators': 50}
3,logistic_regression,0.786561,{'C': 50}


In [56]:
from sklearn.metrics import accuracy_score

In [53]:
clf = SVC(gamma='auto', C=100, kernel='linear')

In [54]:
clf.fit(X_train, y_train)

SVC(C=100, gamma='auto', kernel='linear')

In [55]:
y_pred = clf.predict(X_test)

In [57]:
accuracy_score(y_test, y_pred)

0.7947976878612717

In [58]:
model_params = {
    'svm': {
        'model': SVC(gamma='auto'),
        'params' : {
            'C': [100, 200],
            'kernel': ['linear']
        }  
    },
    'random_forest': {
        'model': RandomForestClassifier(),
        'params' : {
            'n_estimators': [100,200]
        }
    },
    'logistic_regression' : {
        'model': LogisticRegression(solver='liblinear',multi_class='auto'),
        'params': {
            'C': [100,200]
        }
    }
}

In [59]:
scores = []

for model_name, mp in model_params.items():
    clf =  GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
    clf.fit(X_train, y_train)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })
    
df = pd.DataFrame(scores,columns=['model','best_score','best_params'])
df

Unnamed: 0,model,best_score,best_params
0,svm,0.790535,"{'C': 200, 'kernel': 'linear'}"
1,random_forest,0.781069,{'n_estimators': 200}
2,logistic_regression,0.794653,{'C': 200}


In [60]:
model_params = {
    'svm': {
        'model': SVC(gamma='auto'),
        'params' : {
            'C': [400],
            'kernel': ['linear']
        }  
    },
    'random_forest': {
        'model': RandomForestClassifier(),
        'params' : {
            'n_estimators': [400]
        }
    },
    'logistic_regression' : {
        'model': LogisticRegression(solver='liblinear',multi_class='auto'),
        'params': {
            'C': [400]
        }
    }
}

In [61]:
scores = []

for model_name, mp in model_params.items():
    clf =  GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
    clf.fit(X_train, y_train)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })
    
df = pd.DataFrame(scores,columns=['model','best_score','best_params'])
df

Unnamed: 0,model,best_score,best_params
0,svm,0.795087,"{'C': 400, 'kernel': 'linear'}"
1,random_forest,0.780491,{'n_estimators': 400}
2,logistic_regression,0.796604,{'C': 400}


In [62]:
model_params = {
    'decision_tree': {
        'model': DecisionTreeClassifier(),
        'params' : {
            'criterion': ['gini', 'entropy']
        }
    }
}

In [63]:
scores = []

for model_name, mp in model_params.items():
    clf =  GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
    clf.fit(X_train, y_train)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })
    
df = pd.DataFrame(scores,columns=['model','best_score','best_params'])
df

Unnamed: 0,model,best_score,best_params
0,decision_tree,0.698266,{'criterion': 'entropy'}
