In [1]:
# imports
import numpy as np
import pandas as pd
import tensorflow as tf




In [2]:
# open training dataset
Data = pd.read_csv('./Dataset/twitter_training.csv')

# see snapshot of the dataset
Data.head()

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [3]:
# know our Dataset's size
Data.shape

(74681, 4)

In [4]:
# know about our Dataset
Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74681 entries, 0 to 74680
Data columns (total 4 columns):
 #   Column                                                 Non-Null Count  Dtype 
---  ------                                                 --------------  ----- 
 0   2401                                                   74681 non-null  int64 
 1   Borderlands                                            74681 non-null  object
 2   Positive                                               74681 non-null  object
 3   im getting on borderlands and i will murder you all ,  73995 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB


In [5]:
# know our Data columns
Data.columns

Index(['2401', 'Borderlands', 'Positive',
       'im getting on borderlands and i will murder you all ,'],
      dtype='object')

In [6]:
# preprocess our dataset for training

# changing Column names
Data.rename(columns={"Borderlands": "Entity", "Positive": "Sentiment",
                     "im getting on borderlands and i will murder you all ,": "Text"}, inplace=True)

# see changes
Data.head()

Unnamed: 0,2401,Entity,Sentiment,Text
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [7]:
# find any duplicates in our Dataset
Data.duplicated().sum()

2700

In [8]:
# remove these duplicated entries
Data.drop_duplicates(inplace=True)

# see the changes
Data.duplicated().sum()

0

In [9]:
# join Entity values to that of the Text values to have just one entry
Data['Text'] = Data['Entity'].str.cat(Data['Text'], sep=" ")

# see the changes
Data.head()

Unnamed: 0,2401,Entity,Sentiment,Text
0,2401,Borderlands,Positive,Borderlands I am coming to the borders and I w...
1,2401,Borderlands,Positive,Borderlands im getting on borderlands and i wi...
2,2401,Borderlands,Positive,Borderlands im coming on borderlands and i wil...
3,2401,Borderlands,Positive,Borderlands im getting on borderlands 2 and i ...
4,2401,Borderlands,Positive,Borderlands im getting into borderlands and i ...


In [10]:
# lets determine unique values in our Dataset
Data.nunique()

2401         12447
Entity          32
Sentiment        4
Text         70525
dtype: int64

In [11]:
# Lets see the Entity groups
Data.groupby(Data['Entity']).count()

Unnamed: 0_level_0,2401,Sentiment,Text
Entity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Amazon,2264,2264,2249
ApexLegends,2289,2289,2278
AssassinsCreed,2160,2160,2156
Battlefield,2267,2267,2255
Borderlands,2210,2210,2205
CS-GO,2207,2207,2195
CallOfDuty,2322,2322,2314
CallOfDutyBlackopsColdWar,2261,2261,2242
Cyberpunk2077,2193,2193,2175
Dota2,2229,2229,2225


In [12]:
# Lets see the Sentiment groups
Data.groupby(Data['Sentiment']).count()

Unnamed: 0_level_0,2401,Entity,Text
Sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Irrelevant,12584,12584,12537
Negative,21787,21787,21698
Neutral,17800,17800,17708
Positive,19810,19810,19712


In [13]:
# drop irrelevant coloumns with their entries
Data.drop(columns=["2401", "Entity"], inplace=True)

# see the changes
Data.head()

Unnamed: 0,Sentiment,Text
0,Positive,Borderlands I am coming to the borders and I w...
1,Positive,Borderlands im getting on borderlands and i wi...
2,Positive,Borderlands im coming on borderlands and i wil...
3,Positive,Borderlands im getting on borderlands 2 and i ...
4,Positive,Borderlands im getting into borderlands and i ...


In [14]:
# change variable
testData = Data

# see the change
testData.head()

Unnamed: 0,Sentiment,Text
0,Positive,Borderlands I am coming to the borders and I w...
1,Positive,Borderlands im getting on borderlands and i wi...
2,Positive,Borderlands im coming on borderlands and i wil...
3,Positive,Borderlands im getting on borderlands 2 and i ...
4,Positive,Borderlands im getting into borderlands and i ...


In [15]:
# open validation dataset
valData = pd.read_csv("./Dataset/twitter_validation.csv")

# see a snapshot of the dataset
valData.head()

Unnamed: 0,3364,Facebook,Irrelevant,"I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣"
0,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
1,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
2,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
3,4433,Google,Neutral,Now the President is slapping Americans in the...
4,6273,FIFA,Negative,Hi @EAHelp I’ve had Madeleine McCann in my cel...


In [16]:
# know our valDataset's size
valData.shape

(999, 4)

In [17]:
# know about our validation Dataset
valData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 4 columns):
 #   Column                                                                                                                                                                                                                                              Non-Null Count  Dtype 
---  ------                                                                                                                                                                                                                                              --------------  ----- 
 0   3364                                                                                                                                                                                                                                                999 non-null    int64 
 1   Facebook                                                                   

In [18]:
# know our Data columns
valData.columns

Index(['3364', 'Facebook', 'Irrelevant',
       'I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣'],
      dtype='object')

In [19]:
# preprocess our datasets for validation

# changing Column names
valData.rename(columns={"Facebook": "Entity", "Irrelevant": "Sentiment",
                        "I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣": "Text"},
                        inplace=True)

# see changes
valData.head()

Unnamed: 0,3364,Entity,Sentiment,Text
0,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
1,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
2,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
3,4433,Google,Neutral,Now the President is slapping Americans in the...
4,6273,FIFA,Negative,Hi @EAHelp I’ve had Madeleine McCann in my cel...


In [20]:
# find any duplicates in our Dataset
valData.duplicated().sum()

0

In [21]:
# join Entity values to that of the Text values to have just one entry
valData['Text'] = valData['Entity'].str.cat(valData['Text'], sep=" ")

# see the changes
valData.head()

Unnamed: 0,3364,Entity,Sentiment,Text
0,352,Amazon,Neutral,Amazon BBC News - Amazon boss Jeff Bezos rejec...
1,8312,Microsoft,Negative,Microsoft @Microsoft Why do I pay for WORD whe...
2,4371,CS-GO,Negative,CS-GO CSGO matchmaking is so full of closet ha...
3,4433,Google,Neutral,Google Now the President is slapping Americans...
4,6273,FIFA,Negative,FIFA Hi @EAHelp I’ve had Madeleine McCann in m...


In [22]:
# drop irrelevant columns with their entries
valData.drop(columns=["3364", "Entity"], inplace=True)

# see the changes
valData.head()

Unnamed: 0,Sentiment,Text
0,Neutral,Amazon BBC News - Amazon boss Jeff Bezos rejec...
1,Negative,Microsoft @Microsoft Why do I pay for WORD whe...
2,Negative,CS-GO CSGO matchmaking is so full of closet ha...
3,Neutral,Google Now the President is slapping Americans...
4,Negative,FIFA Hi @EAHelp I’ve had Madeleine McCann in m...


In [23]:
# encode Sentiment
testData['Sentiment'].replace({'Positive': 1, 'Negative': 0, 'Neutral': 2, 'Irrelevant': 3}, inplace=True)
valData['Sentiment'].replace({'Positive': 1, 'Negative': 0, 'Neutral': 2, 'Irrelevant': 3}, inplace=True)

In [24]:
# see results 1
testData.head()

Unnamed: 0,Sentiment,Text
0,1,Borderlands I am coming to the borders and I w...
1,1,Borderlands im getting on borderlands and i wi...
2,1,Borderlands im coming on borderlands and i wil...
3,1,Borderlands im getting on borderlands 2 and i ...
4,1,Borderlands im getting into borderlands and i ...


In [25]:
# see results 2
valData.head()

Unnamed: 0,Sentiment,Text
0,2,Amazon BBC News - Amazon boss Jeff Bezos rejec...
1,0,Microsoft @Microsoft Why do I pay for WORD whe...
2,0,CS-GO CSGO matchmaking is so full of closet ha...
3,2,Google Now the President is slapping Americans...
4,0,FIFA Hi @EAHelp I’ve had Madeleine McCann in m...


In [26]:
testData.shape

(71981, 2)

In [27]:
dataX = testData['Text']
dataY = testData['Sentiment']

dataX = dataX.astype(str)

In [28]:
valData.shape

(999, 2)

In [29]:
valX = valData['Text']
valY = valData['Sentiment']

valX = valX.astype(str)

In [30]:
# prepare datasets for model training

# imports
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# function tokenizes data
def preprocess(data):
    token = tokenizer(data, truncation=True, padding='max_length',max_length=128, return_tensors="tf")
    return token

  from .autonotebook import tqdm as notebook_tqdm
