In [1]:
# Importing Libraries

import numpy as np # For Handling Arrays in Python
import pandas as pd # For Data Importing and Handling

# For Graphing
import matplotlib.pyplot as plt
import seaborn as sns

import os # For tracking files and folders

# scikit-learn imports
from sklearn.pipeline import Pipeline # For running tasks simultaneously

# For Cross Validation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer # For Feature Extraction from Text

# Machine Learning Algorithms
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

from sklearn.preprocessing import LabelEncoder # For Encoding Text Data
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, roc_auc_score # Performance Metrics for the Algorithm

# Walking through thr directory
for dirname, _, filenames in os.walk('./'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

./data.csv
./Shayma Report v2.docx
./Twitter Mixed NLP.ipynb
./.ipynb_checkpoints\Twitter Mixed NLP-checkpoint.ipynb


In [2]:
# Importing the dataset
# Data from: https://data.world/data-society/twitter-user-data
# Manually Labelled and Reduced to 250 Instances
data = pd.read_csv('data.csv').drop(columns = 'description')

In [3]:
# Viewing the dataset for an idea for the features and appearance
data.head(5)

Unnamed: 0,text,label
0,Robbie E Responds To Critics After Win Against...,sports
1,���It felt like they were my friends and I was...,personal
2,i absolutely adore when louis starts the songs...,personal
3,Hi @JordanSpieth - Looking at the url - do you...,sports
4,Watching Neighbours on Sky+ catching up with t...,entertainment


In [4]:
# Checking the dataset shape for number of rows and columns i.e., instances and features
data.shape

(249, 2)

In [5]:
# Dropping duplicate data instances from the dataset to increase data quality
data.drop_duplicates(inplace = True)

In [6]:
# Checking for any null data instances in the dataset
data.isnull().sum()

text     0
label    0
dtype: int64

In [7]:
data = data.dropna()

In [8]:
# Viewing the dataset for an idea for the features and appearance
# Monitoring the changes
data.head(5)

Unnamed: 0,text,label
0,Robbie E Responds To Critics After Win Against...,sports
1,���It felt like they were my friends and I was...,personal
2,i absolutely adore when louis starts the songs...,personal
3,Hi @JordanSpieth - Looking at the url - do you...,sports
4,Watching Neighbours on Sky+ catching up with t...,entertainment


In [9]:
# Monitoring the changes
data.shape

(249, 2)

In [10]:
# Statistically analysing the dataset's features
data.describe()

Unnamed: 0,text,label
count,249,249
unique,249,18
top,@karen_hauer You frightened me when you hissed...,personal
freq,1,58


In [11]:
# The metadata for the dataset
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 249 entries, 0 to 248
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    249 non-null    object
 1   label   249 non-null    object
dtypes: object(2)
memory usage: 5.8+ KB


In [12]:
# Splitting the dataset's features and outputs
X = data.drop('label', axis = 1)
y = data['label']

In [13]:
X.head(5)

Unnamed: 0,text
0,Robbie E Responds To Critics After Win Against...
1,���It felt like they were my friends and I was...
2,i absolutely adore when louis starts the songs...
3,Hi @JordanSpieth - Looking at the url - do you...
4,Watching Neighbours on Sky+ catching up with t...


In [14]:
y.head(5)

0           sports
1         personal
2         personal
3           sports
4    entertainment
Name: label, dtype: object

In [15]:
# Splitting the dataset into Training and testing datasets for cross validation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 42)

In [16]:
# Extracting features from the text in the dataset
count_vect = CountVectorizer()
X_train_counts_text = count_vect.fit_transform(X_train['text'])
print(X_train_counts_text.shape)

(174, 1299)


In [17]:
# Converting the extracted features into a matrix of TF-IDF features
tfidf_transformer = TfidfTransformer()
X_train_tfidf_text = tfidf_transformer.fit_transform(X_train_counts_text)
print(X_train_tfidf_text.shape)

(174, 1299)


In [18]:
# Initializing a Pipeline for Simultaneous Execution
mnb = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('mnb', MultinomialNB())])
mnb = mnb.fit(X_train['text'], y_train)

In [19]:
# Prediction from the Machine Learning algorithm
mnb_pred = mnb.predict(X_test['text']) # Returns the predicted output
mnb_pred_proba = mnb.predict_proba(X_test['text'])[:, 1] # Returns the probability estimates for the features in the test set

In [20]:
svm = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('svm', SVC(probability=True))])
svm = svm.fit(X_train['text'], y_train)

In [21]:
svm_pred = svm.predict(X_test['text']) # Returns the predicted output
svm_pred_proba = svm.predict_proba(X_test['text'])[:, 1] # Returns the probability estimates for the features in the test set

In [22]:
mlp = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('mlp', MLPClassifier())])
mlp = svm.fit(X_train['text'], y_train)

In [23]:
mlp_pred = svm.predict(X_test['text']) # Returns the predicted output
mlp_pred_proba = svm.predict_proba(X_test['text'])[:, 1] # Returns the probability estimates for the features in the test set

In [24]:
print("For Multinomial NB: ", accuracy_score(y_test, mnb_pred))
print("For SVC: ", accuracy_score(y_test, svm_pred))
print("For MLP: ", accuracy_score(y_test, mlp_pred))

For Multinomial NB:  0.30666666666666664
For SVC:  0.29333333333333333
For MLP:  0.29333333333333333


In [25]:
print("For Multinomial NB: ", classification_report(y_test, mnb_pred, zero_division=1))
print("For SVM: ", classification_report(y_test, svm_pred, zero_division=1))
print("For MLP: ", classification_report(y_test, mlp_pred, zero_division=1))

For Multinomial NB:                 precision    recall  f1-score   support

        anime       1.00      0.00      0.00         1
     business       1.00      0.00      0.00         1
entertainment       0.19      0.57      0.28        14
      fitness       1.00      0.00      0.00         1
         food       1.00      0.00      0.00         3
       health       1.00      0.00      0.00         1
        music       1.00      0.00      0.00         4
       nature       1.00      0.00      0.00         2
         news       1.00      0.25      0.40         4
     personal       0.41      0.57      0.48        21
     politics       1.00      0.18      0.31        11
     religion       1.00      0.00      0.00         1
      science       1.00      0.00      0.00         1
       sports       1.00      0.00      0.00         6
         tech       1.00      0.00      0.00         4

     accuracy                           0.31        75
    macro avg       0.91      0.10      0.

In [26]:
print(X_train.shape, y_train.shape)

(174, 1) (174,)


In [27]:
mnb_scores = cross_val_score(mnb, X_train['text'].values.reshape(174), y_train.values.reshape(174), cv=2)
svm_scores = cross_val_score(svm, X_train['text'].values.reshape(174), y_train.values.reshape(174), cv=2)
mlp_scores = cross_val_score(mlp, X_train['text'].values.reshape(174), y_train.values.reshape(174), cv=2)



In [28]:
print("For Multinomial NB: ", mnb_scores)
print("For SVM: ", svm_scores)
print("For Multi-layer Perceptron: ", mlp_scores)

For Multinomial NB:  [0.34482759 0.4137931 ]
For SVM:  [0.31034483 0.3908046 ]
For Multi-layer Perceptron:  [0.31034483 0.3908046 ]
