### Import libraries

In [1]:
import re
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn import tree
from sklearn import preprocessing
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, recall_score, precision_score

### Load dataset

In [2]:
# Load dataset
df = pd.read_csv("tweets.csv")

# Print first 5 rows of the data.
df.head()

Unnamed: 0,Tweet,Text Label
0,.omg why are poc wearing fugly blue contacts s...,Non-Bullying
1,.Sorry but most of the runners popular right n...,Non-Bullying
2,".those jeans are hideous, and I?m afraid he?s ...",Non-Bullying
3,.I had to dress up for a presentation in class...,Non-Bullying
4,.Am I the only one who thinks justin bieber is...,Non-Bullying


### Data analysis

In [3]:
# Print shape of dataset.
print('Dataset shape is ', df.shape)

Dataset shape is  (1065, 2)


### Data preprocessing

In [4]:
def preprocess_tweet(tweet):
    # Remove words other than alphabets.
    row = re.sub("[^A-Za-z ]", "", tweet).lower()
    
    # Tokenize words.
    words = word_tokenize(row)

    # Remove stop words.
    english_stops = set(stopwords.words('english'))

    # Remove un-necessary words.
    characters_to_remove = ["''",'``',"rt","https","’","“","”","\u200b","--","n't","'s","...","//t.c" ]
    clean_words = [word for word in words if word not in english_stops and word not in characters_to_remove]

    # Lematise words.
    wordnet_lemmatizer = WordNetLemmatizer()
    lemma_list = [wordnet_lemmatizer.lemmatize(word) for word in clean_words]

    return " ".join(lemma_list)

df['Processed_Tweet'] = df['Tweet'].map(preprocess_tweet)

df.head()

Unnamed: 0,Tweet,Text Label,Processed_Tweet
0,.omg why are poc wearing fugly blue contacts s...,Non-Bullying,omg poc wearing fugly blue contact please make...
1,.Sorry but most of the runners popular right n...,Non-Bullying,sorry runner popular right plain fugly
2,".those jeans are hideous, and I?m afraid he?s ...",Non-Bullying,jean hideous im afraid he bought entire collec...
3,.I had to dress up for a presentation in class...,Non-Bullying,dress presentation class today im giving serio...
4,.Am I the only one who thinks justin bieber is...,Non-Bullying,one think justin bieber fugly af


### Feature extraction

In [12]:
# Bag of word
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(df['Processed_Tweet']).toarray()

# Label encode
le = preprocessing.LabelEncoder()
y = le.fit_transform(df['Text Label'])

### Data splitting

In [13]:
# Split dataset.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [7]:
print('No. of rows of training set is ', X_train.shape[0])
print('No. of rows of training set is ', X_test.shape[0])

No. of rows of training set is  798
No. of rows of training set is  267


### Model

In [8]:
# Logistic Regression.
linear_regression_classifer = LogisticRegression(random_state=0)

# Train classifier.
linear_regression_classifer.fit(X_train, y_train)

# Predict on train set.
y_train_pred = linear_regression_classifer.predict(X_train)

# Predict on test set.
y_test_pred = linear_regression_classifer.predict(X_test)

# Accuracy and other metrics.
train_accuracy = accuracy_score(y_train, y_train_pred)
train_recall = recall_score(y_train, y_train_pred, average='binary')
train_precision = precision_score(y_train, y_train_pred, average='binary')

test_accuracy = accuracy_score(y_test, y_test_pred)
test_recall = recall_score(y_test, y_test_pred, average='binary')
test_precision = precision_score(y_test, y_test_pred, average='binary')

print('Logistic regression train set result:')
print('Accuracy', round(train_accuracy, 2) * 100)
print('Re-call', round(train_recall, 2))
print('Precision', round(train_precision, 2))
print()
print('Logistic regression test set result:')
print('Accuracy', round(test_accuracy, 2) * 100)
print('Re-call', round(test_recall, 2))
print('Precision', round(test_precision, 2))

Logistic regression train set result:
Accuracy 93.0
Re-call 0.97
Precision 0.92

Logistic regression test set result:
Accuracy 73.0
Re-call 0.81
Precision 0.77


In [9]:
# SVM Regression.
svm_ovo_classifer = svm.SVC(decision_function_shape='ovo')

# Train classifier.
svm_ovo_classifer.fit(X_train, y_train)

# Predict on train set.
y_train_pred = svm_ovo_classifer.predict(X_train)

# Predict on test set.
y_test_pred = svm_ovo_classifer.predict(X_test)

# Accuracy and other metrics.
train_accuracy = accuracy_score(y_train, y_train_pred)
train_recall = recall_score(y_train, y_train_pred, average='binary')
train_precision = precision_score(y_train, y_train_pred, average='binary')

test_accuracy = accuracy_score(y_test, y_test_pred)
test_recall = recall_score(y_test, y_test_pred, average='binary')
test_precision = precision_score(y_test, y_test_pred, average='binary')

print('SVM train set result:')
print('Accuracy', round(train_accuracy, 2) * 100)
print('Re-call', round(train_recall, 2))
print('Precision', round(train_precision, 2))
print()
print('SVM test set result:')
print('Accuracy', round(test_accuracy, 2) * 100)
print('Re-call', round(test_recall, 2))
print('Precision', round(test_precision, 2))

SVM train set result:
Accuracy 93.0
Re-call 0.97
Precision 0.91

SVM test set result:
Accuracy 73.0
Re-call 0.89
Precision 0.74


In [10]:
# Naive bayes Regression.
naive_bayes_classifier = GaussianNB()

# Train classifier.
naive_bayes_classifier.fit(X_train, y_train)

# Predict on train set.
y_train_pred = naive_bayes_classifier.predict(X_train)

# Predict on test set.
y_test_pred = naive_bayes_classifier.predict(X_test)

# Accuracy and other metrics.
train_accuracy = accuracy_score(y_train, y_train_pred)
train_recall = recall_score(y_train, y_train_pred, average='binary')
train_precision = precision_score(y_train, y_train_pred, average='binary')

test_accuracy = accuracy_score(y_test, y_test_pred)
test_recall = recall_score(y_test, y_test_pred, average='binary')
test_precision = precision_score(y_test, y_test_pred, average='binary')

print('Naive bayes train set result:')
print('Accuracy', round(train_accuracy, 2) * 100)
print('Re-call', round(train_recall, 2))
print('Precision', round(train_precision, 2))
print()
print('Naive bayes test set result:')
print('Accuracy', round(test_accuracy, 2) * 100)
print('Re-call', round(test_recall, 2))
print('Precision', round(test_precision, 2))

Naive bayes train set result:
Accuracy 86.0
Re-call 0.76
Precision 1.0

Naive bayes test set result:
Accuracy 59.0
Re-call 0.53
Precision 0.75


In [11]:
# Decision tree Regression.
decision_tree_regression_classifer = tree.DecisionTreeClassifier()

# Train classifier.
decision_tree_regression_classifer.fit(X_train, y_train)

# Predict on train set.
y_train_pred = decision_tree_regression_classifer.predict(X_train)

# Predict on test set.
y_test_pred = decision_tree_regression_classifer.predict(X_test)

# Accuracy and other metrics.
train_accuracy = accuracy_score(y_train, y_train_pred)
train_recall = recall_score(y_train, y_train_pred, average='binary')
train_precision = precision_score(y_train, y_train_pred, average='binary')

test_accuracy = accuracy_score(y_test, y_test_pred)
test_recall = recall_score(y_test, y_test_pred, average='binary')
test_precision = precision_score(y_test, y_test_pred, average='binary')

print('Decision tree train set result:')
print('Accuracy', round(train_accuracy, 2) * 100)
print('Re-call', round(train_recall, 2))
print('Precision', round(train_precision, 2))
print()
print('Decision tree test set result:')
print('Accuracy', round(test_accuracy, 2) * 100)
print('Re-call', round(test_recall, 2))
print('Precision', round(test_precision, 2))

Decision tree train set result:
Accuracy 99.0
Re-call 1.0
Precision 0.99

Decision tree test set result:
Accuracy 70.0
Re-call 0.69
Precision 0.81
