In [1]:
#Import libraries

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from nltk.corpus import stopwords
from string import punctuation
import pandas as pd

In [2]:
#Clean texts function

def _default_stop_words():
    """
    Return the default tokens to drop: NLTK English stop words plus
    every single character in ``string.punctuation``.

    The set is built on first use and cached on the function object so
    the stop-word list is requested once, not once per cleaned document.
    """
    cached = getattr(_default_stop_words, '_cache', None)
    if cached is None:
        cached = frozenset(stopwords.words('english')) | frozenset(punctuation)
        _default_stop_words._cache = cached
    return cached


def clean(text, stop_words=None):
    """
    Lower-case ``text``, split it on whitespace and remove stop tokens.

    Parameters
    ----------
    text : str
        Raw document to clean.
    stop_words : iterable of str, optional
        Tokens to drop. When omitted, the NLTK English stop words plus
        the individual punctuation characters are used.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.

    Notes
    -----
    Only whole tokens are compared, so punctuation attached to a word
    (e.g. ``"fire!"``) is kept; a punctuation character is removed only
    when it stands alone between whitespace.
    """
    if stop_words is None:
        excluded = _default_stop_words()
    else:
        # A set gives constant-time membership tests for every token.
        excluded = frozenset(stop_words)

    tokens = text.lower().split()
    return ' '.join(token for token in tokens if token not in excluded)

In [3]:
def accuracy(estimator, features=None, target=None, cv=10):
    """
    Return the mean cross-validated accuracy of a model, in percent.

    Parameters
    ----------
    estimator : estimator object
        Model passed to ``cross_val_score``.
    features : optional
        Feature matrix to evaluate on. Defaults to the notebook-level ``X``.
    target : optional
        Labels to evaluate against. Defaults to the notebook-level ``y``.
    cv : optional
        Cross-validation setting forwarded to ``cross_val_score``
        (10 folds by default).

    Returns
    -------
    float
        Mean of the per-fold accuracy scores multiplied by 100.
    """
    # Fall back to the notebook globals so existing ``accuracy(model)``
    # calls behave exactly as before.
    data = X if features is None else features
    labels = y if target is None else target

    fold_scores = cross_val_score(estimator=estimator, X=data, y=labels,
                                  scoring='accuracy', cv=cv)
    return fold_scores.mean() * 100

In [5]:
# Load the data: the test file is read in full, while only the 'text'
# and 'target' columns of the training file are kept.
test = pd.read_csv('test.csv')
dataset = pd.read_csv('train.csv')[['text', 'target']]

In [6]:
# Run every training and test document through clean() to build the corpora.
train_corpus = list(map(clean, dataset['text']))
test_corpus = list(map(clean, test['text']))

In [7]:
# Create features

# One bag-of-words vocabulary is fitted over the train and test texts
# together, then the rows are split back into their two sets.
# NOTE(review): this puts test-set tokens into the vocabulary; fit on
# train_corpus only and call cv.transform(test_corpus) if that is not intended.
cv = CountVectorizer()
X_all = cv.fit_transform(train_corpus + test_corpus)
y = dataset['target']

# Split at the real number of training documents rather than a hard-coded
# row count, so the cell stays correct if train.csv changes size.
n_train = len(train_corpus)
X = X_all[:n_train, :]
X_test = X_all[n_train:, :]

assert X.shape[0] == len(y), "training features and labels are misaligned"
assert X_test.shape[0] == len(test_corpus), "unexpected number of test rows"

In [11]:
# Random forest: 10 trees, fixed seed, parallel fitting (n_jobs=-1).
forest = RandomForestClassifier(
    n_estimators=10,
    random_state=0,
    n_jobs=-1,
)
forest.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
                       oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [12]:
# Mean 10-fold cross-validated accuracy (percent) of the random forest.
accuracy(forest)

63.364741449658055

In [13]:
# Extra-trees ensemble: 100 trees, fixed seed, parallel fitting (n_jobs=-1).
extra_trees = ExtraTreesClassifier(
    n_estimators=100,
    random_state=0,
    n_jobs=-1,
)
extra_trees.fit(X, y)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
                     max_depth=None, max_features='auto', max_leaf_nodes=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
                     oob_score=False, random_state=0, verbose=0,
                     warm_start=False)

In [14]:
# Mean 10-fold cross-validated accuracy (percent) of the extra-trees model.
accuracy(extra_trees)

60.8669429302205

In [15]:
# AdaBoost with the extra-trees ensemble defined above as its base learner
# (100 boosting rounds, fixed seed).
# NOTE(review): `base_estimator` was renamed to `estimator` in
# scikit-learn 1.2; keep the old keyword only while pinned to an older release.
ada = AdaBoostClassifier(
    base_estimator=extra_trees,
    n_estimators=100,
    random_state=0,
)
ada.fit(X, y)

AdaBoostClassifier(algorithm='SAMME.R',
                   base_estimator=ExtraTreesClassifier(bootstrap=False,
                                                       class_weight=None,
                                                       criterion='gini',
                                                       max_depth=None,
                                                       max_features='auto',
                                                       max_leaf_nodes=None,
                                                       min_impurity_decrease=0.0,
                                                       min_impurity_split=None,
                                                       min_samples_leaf=1,
                                                       min_samples_split=2,
                                                       min_weight_fraction_leaf=0.0,
                                                       n_estimators=100,
                                               

In [16]:
# Mean 10-fold cross-validated accuracy (percent) of the AdaBoost model.
accuracy(ada)

64.25699501579349

In [17]:
# Gradient boosting with library-default settings and a fixed seed.
# fit() returns the estimator itself, so the fitted model is bound directly.
grad = GradientBoostingClassifier(random_state=0).fit(X, y)
grad

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='auto',
                           random_state=0, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [18]:
# Mean 10-fold cross-validated accuracy (percent) of gradient boosting.
accuracy(grad)

62.31301041618794

In [21]:
# XGBoost classifier with a learning rate of 0.01; other settings are defaults.
xg = XGBClassifier(
    learning_rate=0.01,
)
xg.fit(X, y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.01, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [22]:
# Mean 10-fold cross-validated accuracy (percent) of the XGBoost model.
accuracy(xg)

64.57452052329913

In [23]:
# Logistic regression with library-default hyperparameters.
# fit() returns the estimator itself, so the fitted model is bound directly.
logistic_regressor = LogisticRegression().fit(X, y)
logistic_regressor



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [24]:
# Mean 10-fold cross-validated accuracy (percent) of logistic regression.
accuracy(logistic_regressor)



66.1620543327152

In [25]:
# Single decision tree capped at depth 10, with a fixed seed.
tree = DecisionTreeClassifier(
    max_depth=10,
    random_state=0,
)
tree.fit(X, y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=10,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=0, splitter='best')

In [26]:
# Mean 10-fold cross-validated accuracy (percent) of the decision tree.
accuracy(tree)

64.41697086716466

In [27]:
# Support vector machine with a linear kernel.
# fit() returns the estimator itself, so the fitted model is bound directly.
svm = SVC(kernel='linear').fit(X, y)
svm

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [28]:
# Mean 10-fold cross-validated accuracy (percent) of the linear SVM.
accuracy(svm)

61.19623315780749

In [29]:
# Support vector machine with an RBF kernel (gamma=0.001, C=25, fixed seed).
kernel_svm = SVC(
    kernel='rbf',
    random_state=0,
    gamma=0.001,
    C=25,
)
kernel_svm.fit(X, y)

SVC(C=25, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
    max_iter=-1, probability=False, random_state=0, shrinking=True, tol=0.001,
    verbose=False)

In [30]:
# Mean 10-fold cross-validated accuracy (percent) of the RBF-kernel SVM.
accuracy(kernel_svm)

69.04001807517372