In [3]:
# %pip installs into the environment of the running kernel (a bare `!pip` may hit a different one);
# the version is pinned to the one this notebook was last run with.
%pip install catboost==1.2.7

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
import math

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, ExtraTreesClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [2]:
def load_df(fp):
	"""Read a header-less CSV into a cleaned DataFrame indexed by ``id``.

	The four columns are named ``id``, ``ip``, ``label`` and ``text``; ``id``
	becomes the index. Duplicate rows (compared on the remaining columns) and
	rows containing missing values are then removed. A ``DataFrame.info()``
	summary is printed before the frame is returned.
	"""
	column_names = ['id', 'ip', 'label', 'text']
	cleaned = (
		pd.read_csv(fp, names=column_names)
		.set_index('id')
		.drop_duplicates()
		.dropna()
	)
	cleaned.info()
	return cleaned

In [5]:
# Load the training and validation splits from absolute paths; load_df prints each frame's .info() summary.
df_trn= load_df('/content/twitter_training.csv')
print()  # blank line between the two summaries
df_tst= load_df('/content/twitter_validation.csv')

<class 'pandas.core.frame.DataFrame'>
Index: 70958 entries, 2401 to 9200
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ip      70958 non-null  object
 1   label   70958 non-null  object
 2   text    70958 non-null  object
dtypes: object(3)
memory usage: 2.2+ MB

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, 3364 to 6960
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ip      1000 non-null   object
 1   label   1000 non-null   object
 2   text    1000 non-null   object
dtypes: object(3)
memory usage: 31.2+ KB


In [6]:
# Show five random training rows (no random_state is passed, so the rows differ between runs).
df_trn.sample(5)


Unnamed: 0_level_0,ip,label,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
8143,Microsoft,Positive,I was using NextStep 2. 1. Maybe Good lord did...
6767,Fortnite,Negative,going to actually make me a TwitLonger by expl...
8469,NBA2K,Neutral,I definitely do myPlayer mode
10298,PlayerUnknownsBattlegrounds(PUBG),Positive,omg I'm so excited to see dk play pubg
9099,Nvidia,Negative,The original Nvidia cards launch tomorrow and ...


In [7]:
# Number of training rows per label.
df_trn['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
Negative,21565
Positive,19549
Neutral,17398
Irrelevant,12446


In [8]:
# Drop the 'Irrelevant' class. .copy() makes the result an independent frame, so the later
# df_trn['fltr'] = ... column assignment does not write into a slice of the unfiltered frame
# (the pattern that triggers pandas' SettingWithCopyWarning).
df_trn = df_trn[df_trn['label'] != 'Irrelevant'].copy()
df_trn.info()

<class 'pandas.core.frame.DataFrame'>
Index: 58512 entries, 2401 to 9200
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ip      58512 non-null  object
 1   label   58512 non-null  object
 2   text    58512 non-null  object
dtypes: object(3)
memory usage: 1.8+ MB


In [9]:
# Show five random validation rows (no random_state is passed, so the rows differ between runs).
df_tst.sample(5)


Unnamed: 0_level_0,ip,label,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3764,Cyberpunk2077,Neutral,Free Cyberpunk 2077 Stick Bug over here\nyoutu...
2049,CallOfDuty,Negative,When I search for a game and a map comes up th...
5285,Hearthstone,Negative,Too greedy?\n#Hearthstone pic.twitter.com/caa4...
279,Amazon,Neutral,@NyxJacob Thanks for entering Grand Summoners ...
5121,GrandTheftAuto(GTA),Negative,@RockstarGames how the hell is gta online STIL...


In [10]:
# Number of validation rows per label.
df_tst['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
Neutral,285
Positive,277
Negative,266
Irrelevant,172


In [11]:
# Drop the 'Irrelevant' class. .copy() makes the result an independent frame, so the later
# df_tst['fltr'] = ... column assignment does not write into a slice of the unfiltered frame
# (the pattern that triggers pandas' SettingWithCopyWarning).
df_tst = df_tst[df_tst['label'] != 'Irrelevant'].copy()
df_tst.info()

<class 'pandas.core.frame.DataFrame'>
Index: 828 entries, 352 to 6960
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ip      828 non-null    object
 1   label   828 non-null    object
 2   text    828 non-null    object
dtypes: object(3)
memory usage: 25.9+ KB


In [12]:
# Load the spaCy pipeline named 'en_core_web_sm'; process_text below calls it on every text.
# NOTE(review): process_text only reads is_stop, is_punct and lemma_ — if nothing else in the
# notebook needs the parser/NER components, disabling them here would likely speed up the
# row-by-row processing. Confirm before changing.
nlp= spacy.load('en_core_web_sm')

In [13]:
def process_text(s):
	"""Return the lemmas of ``s`` joined by single spaces, skipping stop words and punctuation.

	Tokens and their attributes come from the notebook-level ``nlp`` pipeline.
	If every token is filtered out, the result is an empty string.
	"""
	kept_lemmas = [
		tok.lemma_
		for tok in nlp(s)
		if not (tok.is_stop or tok.is_punct)
	]
	return ' '.join(kept_lemmas)

In [14]:
# Lemmatised, stop-word-free version of each training text (process_text is called once per row).
df_trn['fltr'] = df_trn['text'].map(process_text)


In [15]:
# Spot-check five random training rows, now including the 'fltr' column.
df_trn.sample(5)


Unnamed: 0_level_0,ip,label,text,fltr
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
7114,johnson&johnson,Neutral,Johnson & Johnson Halts the Talc on Baby Powde...,Johnson Johnson Halts Talc Baby Powder Company...
7851,MaddenNFL,Negative,Has @God_Son80 and you were playing Outside Th...,@God_Son80 play outside Yard filthy
6689,Fortnite,Neutral,I,
2593,Borderlands,Positive,My favourite games I could guess?.. Also 1. Mi...,favourite game guess 1 Minecraft t beat 2 bloo...
2975,Dota2,Negative,Biggest scam story,big scam story


In [16]:
# Lemmatised, stop-word-free version of each validation text (process_text is called once per row).
df_tst['fltr'] = df_tst['text'].map(process_text)

In [17]:
# Spot-check five random validation rows, now including the 'fltr' column.
df_tst.sample(5)

Unnamed: 0_level_0,ip,label,text,fltr
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
9608,PlayStation5(PS5),Negative,God when 🥺,God 🥺
5122,GrandTheftAuto(GTA),Neutral,People who killed Michael or Trevor at the end...,People kill Michael Trevor end gta probably ho...
6091,FIFA,Neutral,So #Maria was playing FIFA today.,Maria play FIFA today
13039,Xbox(Xseries),Positive,Damn! Microsoft is going all out in showing of...,damn Microsoft go show new Xbox Series X Sony ...
1694,CallOfDutyBlackopsColdWar,Positive,FUCKING FINALLLLYYYYYYYYY,fucking finallllyyyyyyyyy


In [18]:
# Turn the string labels into integer codes. The encoder learns its classes from the
# training labels only and is then applied unchanged to both splits.
enc = LabelEncoder().fit(df_trn['label'])
y_trn = enc.transform(df_trn['label'])
y_tst = enc.transform(df_tst['label'])

In [19]:
# TF-IDF features over the filtered text. The vectorizer is fitted on the training
# documents only; the validation documents are transformed with that fitted vectorizer.
trn_docs = df_trn['fltr']
tst_docs = df_tst['fltr']

vct = TfidfVectorizer()
X_trn = vct.fit_transform(trn_docs)
X_tst = vct.transform(tst_docs)

In [20]:
def model_report(model, verbose=True):
	"""Fit ``model`` on the training matrix and evaluate it on both splits.

	Uses the notebook-level ``X_trn``, ``y_trn``, ``X_tst`` and ``y_tst``
	defined in earlier cells.

	Parameters
	----------
	model : estimator
		Object exposing ``fit``, ``predict`` and ``score``. It is fitted in place.
	verbose : bool, default True
		When true, print both scores and the classification report, then
		draw the confusion matrix.

	Returns
	-------
	dict
		``'trn'`` (score on the training split), ``'tst'`` (accuracy on the
		test split), ``'cm'`` (confusion matrix) and ``'cr'`` (classification
		report text).
	"""
	model.fit(X_trn, y_trn)

	y_pred = model.predict(X_tst)
	trnScore = model.score(X_trn, y_trn)
	# Reuse the predictions computed above rather than calling model.score(X_tst, y_tst),
	# which would run predict on the test set a second time. For classifiers, score()
	# is mean accuracy, so the value is the same and is guaranteed to agree with cm / cr.
	tstScore = accuracy_score(y_tst, y_pred)
	cm = confusion_matrix(y_tst, y_pred)
	cr = classification_report(y_tst, y_pred)

	if verbose:
		print('Train Score: %f'%trnScore)
		print('Test Score:  %f'%tstScore)
		print('Classification Report:\n', cr)
		ConfusionMatrixDisplay(cm).plot()
		plt.show()
		print()

	return {
		'trn': trnScore,
		'tst': tstScore,
		'cm':  cm,
		'cr':  cr,
	}


In [None]:
# Fixed seed for the scikit-learn tree-based estimators, whose default random_state=None
# would otherwise give different scores on every run.
SEED = 42

# Display name -> unfitted estimator. Every model is trained on the same TF-IDF features.
# NOTE(review): the saved output of this cell ends at the 'Support Vector' step, so the
# run may not have completed — SVC() on the full training matrix may be slow; confirm.
models_dict = {
	'LogisticRegression':     LogisticRegression(max_iter=10_000),
	'Support Vector':         SVC(),
	'KNeighborsClassifier':   KNeighborsClassifier(),
	'DecisionTreeClassifier': DecisionTreeClassifier(random_state=SEED),
	'RandomForestClassifier': RandomForestClassifier(random_state=SEED),
	'BaggingClassifier':      BaggingClassifier(random_state=SEED),
	'ExtraTreesClassifier':   ExtraTreesClassifier(random_state=SEED),
	'AdaBoostClassifier':     AdaBoostClassifier(random_state=SEED),
	'XGBClassifier':          XGBClassifier(),
	'CatBoostClassifier':     CatBoostClassifier(verbose=False),
	'LGBMClassifier':         LGBMClassifier(),
}
# One record per model; model_report's result keys ('trn', 'tst', 'cm', 'cr') are merged in below.
models = [{'name': name, 'obj': estimator} for name, estimator in models_dict.items()]

for done, model in enumerate(models):
	print("Evaluating %s..."%model['name'])
	# Number of models finished so far; '\r' lets the next print overwrite this line.
	print("%d/%d models"%(done, len(models)), end='\r')
	model.update(model_report(model['obj'], verbose=False))
print("%d/%d models evaluated"%(len(models), len(models)))
print("done")

Evaluating LogisticRegression...
Evaluating Support Vector...


In [None]:
# Leaderboard: one row per evaluated model, best test score first.
score_rows = [
	{'Algorithm': m['name'], 'Train Score': m['trn'], 'Test Score': m['tst']}
	for m in models
]
score_table = pd.DataFrame(score_rows, columns=['Algorithm', 'Train Score', 'Test Score'])
score_table.set_index('Algorithm').sort_values(by='Test Score', ascending=False)


In [None]:
# One confusion-matrix figure per model, ordered from best to worst test score.
ranked_models = sorted(models, key=lambda entry: entry['tst'], reverse=True)
for model in ranked_models:
	matrix_display = ConfusionMatrixDisplay(model['cm'])
	matrix_display.plot()
	plt.title(model['name'])