### Import libraries

In [30]:
import re
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import tree
from sklearn import preprocessing
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.model_selection import train_test_split

### Load dataset

In [29]:
# Load dataset
df = pd.read_csv("tweets.csv")

# Print first 5 rows of the data.
df.head()
df.sample

<bound method NDFrame.sample of                                                   Tweet    Text Label
0     .omg why are poc wearing fugly blue contacts s...  Non-Bullying
1     .Sorry but most of the runners popular right n...  Non-Bullying
2     .those jeans are hideous, and I?m afraid he?s ...  Non-Bullying
3     .I had to dress up for a presentation in class...  Non-Bullying
4     .Am I the only one who thinks justin bieber is...  Non-Bullying
...                                                 ...           ...
1060  No we are not, But you are a race baiting libt...      Bullying
1061  you wont get anyone for this challenge., after...      Bullying
1062  I will follow you if you are not a libtard,Mus...      Bullying
1063  michaelianblack Ur a child, an ostrich w/ your...      Bullying
1064  FoxNews. not to all the ppl I know that live t...      Bullying

[1065 rows x 2 columns]>

### Data analysis

In [3]:
# Print shape of dataset.
print('Dataset shape is ', df.shape)

Dataset shape is  (1065, 2)


### Data preprocessing

In [52]:
def preprocess_tweet(tweet):
    # Remove words other than alphabets.
    row = re.sub("[^A-Za-z ]", "", tweet).lower()
    
    # Tokenize words.
    words = word_tokenize(row)

    # Remove stop words.
    english_stops = set(stopwords.words('english'))

    # Remove un-necessary words.
    characters_to_remove = ["''",'``',"rt","https","’","“","”","\u200b","--","n't","'s","...","//t.c" ]
    clean_words = [word for word in words if word not in english_stops and word not in characters_to_remove]

    # Lematise words.
    wordnet_lemmatizer = WordNetLemmatizer()
    lemma_list = [wordnet_lemmatizer.lemmatize(word) for word in clean_words]

    return " ".join(lemma_list)

df['Processed_Tweet'] = df['Tweet'].map(preprocess_tweet)

df.head()
df.sample

<bound method NDFrame.sample of                                                   Tweet    Text Label  \
0     .omg why are poc wearing fugly blue contacts s...  Non-Bullying   
1     .Sorry but most of the runners popular right n...  Non-Bullying   
2     .those jeans are hideous, and I?m afraid he?s ...  Non-Bullying   
3     .I had to dress up for a presentation in class...  Non-Bullying   
4     .Am I the only one who thinks justin bieber is...  Non-Bullying   
...                                                 ...           ...   
1060  No we are not, But you are a race baiting libt...      Bullying   
1061  you wont get anyone for this challenge., after...      Bullying   
1062  I will follow you if you are not a libtard,Mus...      Bullying   
1063  michaelianblack Ur a child, an ostrich w/ your...      Bullying   
1064  FoxNews. not to all the ppl I know that live t...      Bullying   

                                        Processed_Tweet  
0     omg poc wearing fugly blue 

### Feature extraction

In [5]:
# Bag of word
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(df['Processed_Tweet']).toarray()

# Label encode
le = preprocessing.LabelEncoder()
y = le.fit_transform(df['Text Label'])

### Data splitting

In [43]:
# Split dataset.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
print('X Train', X_train[0:5])
print('Y Train', y_train[0:5])

X Train [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
Y Train [1 0 1 1 0]


In [7]:
print('No. of rows of training set is ', X_train.shape[0])
print('No. of rows of training set is ', X_test.shape[0])

No. of rows of training set is  798
No. of rows of training set is  267


### Model

In [13]:
# Logistic Regression.
linear_regression_classifer = LogisticRegression(random_state=0)

# Train classifier.
linear_regression_classifer.fit(X_train, y_train)

# Predict on train set.
y_train_pred = linear_regression_classifer.predict(X_train)

# Predict on test set.
y_test_pred = linear_regression_classifer.predict(X_test)

# Accuracy and other metrics.
train_accuracy = accuracy_score(y_train, y_train_pred)
train_recall = recall_score(y_train, y_train_pred, average='binary')
train_precision = precision_score(y_train, y_train_pred, average='binary')

test_accuracy = accuracy_score(y_test, y_test_pred)
test_recall = recall_score(y_test, y_test_pred, average='binary')
test_precision = precision_score(y_test, y_test_pred, average='binary')

print('Logistic regression train set result:')
print('Accuracy', round(train_accuracy, 2) * 100)
print('Re-call', round(train_recall, 2))
print('Precision', round(train_precision, 2))
print()
print('Logistic regression test set result:')
print('Accuracy', round(test_accuracy, 2) * 100)
print('Re-call', round(test_recall, 2))
print('Precision', round(test_precision, 2))

Logistic regression train set result:
Accuracy 93.0
Re-call 0.97
Precision 0.92

Logistic regression test set result:
Accuracy 73.0
Re-call 0.81
Precision 0.77


In [14]:
# SVM Regression.
svm_ovo_classifer = svm.SVC(decision_function_shape='ovo')

# Train classifier.
svm_ovo_classifer.fit(X_train, y_train)

# Predict on train set.
y_train_pred = svm_ovo_classifer.predict(X_train)

# Predict on test set.
y_test_pred = svm_ovo_classifer.predict(X_test)

# Accuracy and other metrics.
train_accuracy = accuracy_score(y_train, y_train_pred)
train_recall = recall_score(y_train, y_train_pred, average='binary')
train_precision = precision_score(y_train, y_train_pred, average='binary')

test_accuracy = accuracy_score(y_test, y_test_pred)
test_recall = recall_score(y_test, y_test_pred, average='binary')
test_precision = precision_score(y_test, y_test_pred, average='binary')

print('SVM train set result:')
print('Accuracy', round(train_accuracy, 2) * 100)
print('Re-call', round(train_recall, 2))
print('Precision', round(train_precision, 2))
print()
print('SVM test set result:')
print('Accuracy', round(test_accuracy, 2) * 100)
print('Re-call', round(test_recall, 2))
print('Precision', round(test_precision, 2))

SVM train set result:
Accuracy 93.0
Re-call 0.97
Precision 0.92

SVM test set result:
Accuracy 74.0
Re-call 0.9
Precision 0.75


In [9]:
# Naive bayes Regression.
naive_bayes_classifier = GaussianNB()

# Train classifier.
naive_bayes_classifier.fit(X_train, y_train)

# Predict on train set.
y_train_pred = naive_bayes_classifier.predict(X_train)

# Predict on test set.
y_test_pred = naive_bayes_classifier.predict(X_test)

# Accuracy and other metrics.
train_accuracy = accuracy_score(y_train, y_train_pred)
train_recall = recall_score(y_train, y_train_pred, average='binary')
train_precision = precision_score(y_train, y_train_pred, average='binary')

test_accuracy = accuracy_score(y_test, y_test_pred)
test_recall = recall_score(y_test, y_test_pred, average='binary')
test_precision = precision_score(y_test, y_test_pred, average='binary')

print('Naive bayes train set result:')
print('Accuracy', round(train_accuracy, 2) * 100)
print('Re-call', round(train_recall, 2))
print('Precision', round(train_precision, 2))
print()
print('Naive bayes test set result:')
print('Accuracy', round(test_accuracy, 2) * 100)
print('Re-call', round(test_recall, 2))
print('Precision', round(test_precision, 2))

Naive bayes train set result:
Accuracy 86.0
Re-call 0.76
Precision 1.0

Naive bayes test set result:
Accuracy 59.0
Re-call 0.53
Precision 0.75


In [15]:
# Decision tree Regression.
decision_tree_regression_classifer = tree.DecisionTreeClassifier()

# Train classifier.
decision_tree_regression_classifer.fit(X_train, y_train)

# Predict on train set.
y_train_pred = decision_tree_regression_classifer.predict(X_train)

# Predict on test set.
y_test_pred = decision_tree_regression_classifer.predict(X_test)

# Accuracy and other metrics.
train_accuracy = accuracy_score(y_train, y_train_pred)
train_recall = recall_score(y_train, y_train_pred, average='binary')
train_precision = precision_score(y_train, y_train_pred, average='binary')

test_accuracy = accuracy_score(y_test, y_test_pred)
test_recall = recall_score(y_test, y_test_pred, average='binary')
test_precision = precision_score(y_test, y_test_pred, average='binary')

print('Decision tree train set result:')
print('Accuracy', round(train_accuracy, 2) * 100)
print('Re-call', round(train_recall, 2))
print('Precision', round(train_precision, 2))
print()
print('Decision tree test set result:')
print('Accuracy', round(test_accuracy, 2) * 100)
print('Re-call', round(test_recall, 2))
print('Precision', round(test_precision, 2))

Decision tree train set result:
Accuracy 99.0
Re-call 1.0
Precision 0.99

Decision tree test set result:
Accuracy 72.0
Re-call 0.7
Precision 0.83


In [46]:
# Random forest classifier.
random_forest_classifer = RandomForestClassifier(max_depth=2, random_state=0)

# Train classifier.
random_forest_classifer.fit(X_train, y_train)

# Predict on train set.
y_train_pred = random_forest_classifer.predict(X_train)

# Predict on test set.
y_test_pred = random_forest_classifer.predict(X_test)

# Accuracy and other metrics.
train_accuracy = accuracy_score(y_train, y_train_pred)
train_recall = recall_score(y_train, y_train_pred, average='binary')
train_precision = precision_score(y_train, y_train_pred, average='binary')

test_accuracy = accuracy_score(y_test, y_test_pred)
test_recall = recall_score(y_test, y_test_pred, average='binary')
test_precision = precision_score(y_test, y_test_pred, average='binary')

print('Random Forest Classifier train set result:')
print('Accuracy', round(train_accuracy, 2) * 100)
print('Re-call', round(train_recall, 2))
print('Precision', round(train_precision, 2))
print()
print('Random Forest Classifier test set result:')
print('Accuracy', round(test_accuracy, 2) * 100)
print('Re-call', round(test_recall, 2))
print('Precision', round(test_precision, 2))

Random Forest Classifier train set result:
Accuracy 59.0
Re-call 1.0
Precision 0.59

Random Forest Classifier test set result:
Accuracy 63.0
Re-call 1.0
Precision 0.63


In [12]:
#  Gradient Boosting classifier.
Gradient_Boost_Classifier = GradientBoostingClassifier(random_state=0)

# Train classifier.
Gradient_Boost_Classifier.fit(X_train, y_train)

# Predict on train set.
y_train_pred = Gradient_Boost_Classifier.predict(X_train)

# Predict on test set.
y_test_pred = Gradient_Boost_Classifier.predict(X_test)

# Accuracy and other metrics.
train_accuracy = accuracy_score(y_train, y_train_pred)
train_recall = recall_score(y_train, y_train_pred, average='binary')
train_precision = precision_score(y_train, y_train_pred, average='binary')

test_accuracy = accuracy_score(y_test, y_test_pred)
test_recall = recall_score(y_test, y_test_pred, average='binary')
test_precision = precision_score(y_test, y_test_pred, average='binary')

print('Gradient Boosting Classifier train set result:')
print('Accuracy', round(train_accuracy, 2) * 100)
print('Re-call', round(train_recall, 2))
print('Precision', round(train_precision, 2))
print()
print('Gradient Boosting Classifier test set result:')
print('Accuracy', round(test_accuracy, 2) * 100)
print('Re-call', round(test_recall, 2))
print('Precision', round(test_precision, 2))

Gradient Boosting Classifier train set result:
Accuracy 85.0
Re-call 0.95
Precision 0.82

Gradient Boosting Classifier test set result:
Accuracy 73.0
Re-call 0.84
Precision 0.76


In [51]:
#  Grid search CV .
parameters = {}

Grid_Search_CV = GridSearchCV(GradientBoostingClassifier(), parameters)

# Train classifier.
Grid_Search_CV.fit(X_train, y_train)

# Predict on train set.
y_train_pred = Grid_Search_CV.predict(X_train)

# Predict on test set.
y_test_pred = Grid_Search_CV.predict(X_test)

# Accuracy and other metrics.
train_accuracy = accuracy_score(y_train, y_train_pred)
train_recall = recall_score(y_train, y_train_pred, average='binary')
train_precision = precision_score(y_train, y_train_pred, average='binary')

test_accuracy = accuracy_score(y_test, y_test_pred)
test_recall = recall_score(y_test, y_test_pred, average='binary')
test_precision = precision_score(y_test, y_test_pred, average='binary')

print('Grid Search CV train set result:')
print('Accuracy', round(train_accuracy, 2) * 100)
print('Re-call', round(train_recall, 2))
print('Precision', round(train_precision, 2))
print()
print('Grid Search CV test set result:')
print('Accuracy', round(test_accuracy, 2) * 100)
print('Re-call', round(test_recall, 2))
print('Precision', round(test_precision, 2))

Grid Search CV train set result:
Accuracy 85.0
Re-call 0.95
Precision 0.82

Grid Search CV test set result:
Accuracy 73.0
Re-call 0.85
Precision 0.75
