<a href="https://colab.research.google.com/github/DogaSahin/Twitter_Sentiment_Analysis/blob/main/Main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Training Model & Preparing a proper train/test dataset

Importing necesary libraries

In [22]:
!pip install nltk




In [13]:
import numpy as np 
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer 
from sklearn.model_selection import train_test_split
from mlxtend.plotting import plot_confusion_matrix
from matplotlib import rcParams
from collections import Counter
from nltk.tokenize import RegexpTokenizer
import re
import string

import warnings
warnings.filterwarnings("ignore")

Getting a labeled data

In [4]:
data = pd.read_csv("/content/training.1600000.processed.noemoticon.csv", encoding = "ISO-8859-1", engine="python")
data.columns = ["label", "time", "date", "query", "username", "text"]
data.head()


Unnamed: 0,label,time,date,query,username,text
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [5]:
data.columns
print(len(data))
data.shape
data.info()
data.dtypes


1599999
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599999 entries, 0 to 1599998
Data columns (total 6 columns):
 #   Column    Non-Null Count    Dtype 
---  ------    --------------    ----- 
 0   label     1599999 non-null  int64 
 1   time      1599999 non-null  int64 
 2   date      1599999 non-null  object
 3   query     1599999 non-null  object
 4   username  1599999 non-null  object
 5   text      1599999 non-null  object
dtypes: int64(2), object(4)
memory usage: 73.2+ MB


label        int64
time         int64
date        object
query       object
username    object
text        object
dtype: object

Making a more meaningfull dataset

In [7]:
# Selecting the text and label coloumn Assigning 1 to Positive sentment 4

data=data[['text','label']]
data['label'][data['label']==4]=1

# Separating positive and negative tweets
positive_data = data[data['label'] == 1]
negative_data = data[data['label'] == 0]

# taking half of the dataset for try
positive_data = positive_data.iloc[:int(80000)]
negative_data = negative_data.iloc[:int(80000)]

# Combining positive and negative tweets & making all tweets lowercase so we can work better
data = pd.concat([positive_data, negative_data])
data['text']=data['text'].str.lower()

data['text'].head()
data.tail()

Unnamed: 0,text,label
79995,@itsaroy they hurt my feelings when they don't...,0
79996,excited that josh will be back in erie in an h...,0
79997,plans of going to the club have been dashed. n...,0
79998,i hate car washing,0
79999,"cannot actually keep my eyes open.., i feel li...",0


Cleaning and removing punctuations


In [16]:
english_punctuations = string.punctuation
punctuations_list = english_punctuations
def cleaning_punctuations(text):
    translator = str.maketrans('', '', punctuations_list)
    return text.translate(translator)
data['text']= data['text'].apply(lambda x: cleaning_punctuations(x))
data['text'].head()

799999            i love health4uandpets u guys r the best 
800000    im meeting up with one of my besties tonight c...
800001    darealsunisakim thanks for the twitter add sun...
800002    being sick can be really cheap when it hurts t...
800003       lovesbrooklyn2 he has that effect on everyone 
Name: text, dtype: object

Removing email & URL's


In [17]:
def cleaning_email(data):
    return re.sub('@[^\s]+', ' ', data)

data['text']= data['text'].apply(lambda x: cleaning_email(x))

def cleaning_URLs(data):
    return re.sub('((www\.[^\s]+)|(https?://[^\s]+))',' ',data)

data['text'] = data['text'].apply(lambda x: cleaning_URLs(x))
data['text'].head()

799999            i love health4uandpets u guys r the best 
800000    im meeting up with one of my besties tonight c...
800001    darealsunisakim thanks for the twitter add sun...
800002    being sick can be really cheap when it hurts t...
800003       lovesbrooklyn2 he has that effect on everyone 
Name: text, dtype: object

Cleaning and removing Numeric numbers

In [18]:
def cleaning_numbers(data):
    return re.sub('[0-9]+', '', data)

data['text'] = data['text'].apply(lambda x: cleaning_numbers(x))
data['text'].head()

799999             i love healthuandpets u guys r the best 
800000    im meeting up with one of my besties tonight c...
800001    darealsunisakim thanks for the twitter add sun...
800002    being sick can be really cheap when it hurts t...
800003        lovesbrooklyn he has that effect on everyone 
Name: text, dtype: object

Cleaning stopwords from the dataset

In [27]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
print(stop_words)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
{'but', 'are', 'now', 'what', "should've", "she's", 'here', 't', 'them', 'each', 'doing', 'further', "isn't", 'there', 'against', "it's", 'will', "you're", 'over', 'ain', 'yourself', 'same', "you'd", 're', "doesn't", "mustn't", "shan't", 'why', 'themselves', 'o', "wouldn't", 's', 'a', 'up', "aren't", 'hadn', 'most', 'some', "weren't", 'about', 'other', 'very', 'shan', 'with', 'after', 'she', "you've", 'mightn', 'how', 'don', 'won', 'before', 'yourselves', 'more', 'than', 'for', "didn't", 'if', 'again', 'didn', 'hasn', "needn't", 'then', 'i', 'it', 'mustn', 'too', 'shouldn', 'or', 'myself', 'him', "haven't", 'isn', 'does', 'hers', 'own', 'have', 'on', 'can', 'this', 'was', "won't", 'they', 'his', 'weren', 'her', 've', 'your', 'me', 'haven', 'd', 'herself', 'under', 'once', 'yours', 'we', 'when', 'between', 'himself', 'my', 'down', 'above', 'do', 'by', 'through', 'couldn

In [28]:
def cleaning_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in stop_words])
data['text'] = data['text'].apply(lambda text: cleaning_stopwords(text))
data['text'].head()

799999                    love healthuandpets u guys r best
800000    im meeting one besties tonight cant wait girl ...
800001    darealsunisakim thanks twitter add sunisa got ...
800002    sick really cheap hurts much eat real food plu...
800003                        lovesbrooklyn effect everyone
Name: text, dtype: object