# Twitter Sentiment Analysis: EDA 
### Importing Libraries - Load Databases

In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.collocations import *
from nltk import FreqDist
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer
from collections import Counter
from collections import defaultdict
import string
import itertools as it
import emoji
import re
import spacy
import fileinput
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
# Passing `names=` makes pandas treat the first line of each file as data
# and label the four columns with the names given here.
_tweet_columns = ['Tweet ID', 'Entity', 'Sentiment', 'Tweet_content']
twitter_training = pd.read_csv('twitter_training.csv', names=_tweet_columns)
twitter_validation = pd.read_csv('twitter_validation.csv', names=_tweet_columns)

In [None]:
# Overwrite "Tweet ID" in each frame with a 1-based running row number.
# NOTE(review): afterwards every row has a distinct "Tweet ID", so no two rows
# can be identical across *all* columns any more - confirm this is intended.
twitter_training["Tweet ID"] = range(1, len(twitter_training) + 1)
twitter_validation["Tweet ID"] = range(1, len(twitter_validation) + 1)

### - Exploration of each dataset

In [None]:
# Display the first three rows of the training frame.
twitter_training.head(3)

In [None]:
# Display the last three rows of the training frame.
twitter_training.tail(3)

In [None]:
# Display the first three rows of the validation frame.
twitter_validation.head(3)

In [None]:
# Display the last three rows of the validation frame.
twitter_validation.tail(3)

In [None]:
# Remove rows with missing values, then duplicated tweets.
# "Tweet ID" was overwritten earlier with a unique running number, so comparing
# whole rows could never find a duplicate; compare the content columns instead.
# Surviving rows keep their original index labels.
twitter_training.dropna(inplace=True)
twitter_training.drop_duplicates(
    subset=['Entity', 'Sentiment', 'Tweet_content'], inplace=True
)

In [None]:
# Print column dtypes and non-null counts of the training frame.
twitter_training.info()

In [None]:
# Print column dtypes and non-null counts of the validation frame.
twitter_validation.info()

In [None]:
# Find emojis in the whole training dataframe

# One character from U+1F300-U+1F650, or one from U+2000-U+3000. The second
# range also holds typographic punctuation, so not every hit is an emoji.
emoji_regex = re.compile(u'[\U0001f300-\U0001f650]|[\u2000-\u3000]')

# Dictionary storing emoji counts
emoji_count = defaultdict(int)
for tweet_text in twitter_training['Tweet_content']:
    # The loop variable is deliberately not called `emoji`: that name is the
    # imported `emoji` module and would be rebound to a string.
    for symbol in emoji_regex.findall(tweet_text):
        emoji_count[symbol] += 1

#By adding this we find more "emojis" - is there another way to find these emojis... what if we didn't find them all?
#|[\U0001F600-\U0001F64F]|[\U0001F300-\U0001F5FF]|[\U0001F680-\U0001F6FF]|[\U0001F1E0-\U0001F1FF]|[\U00002702-\U000027B0]|[\U000024C2-\U0001F251]
print(emoji_count)

In [None]:
# Same emoji census as for the training data, on the validation tweets.
# One character from U+1F300-U+1F650, or one from U+2000-U+3000 (the latter
# range also contains typographic punctuation).
emoji_regex = re.compile(u'[\U0001f300-\U0001f650]|[\u2000-\u3000]')

emoji_count = defaultdict(int)
for tweet_text in twitter_validation['Tweet_content']:
    # Not named `emoji`, so the imported `emoji` module is not rebound.
    for symbol in emoji_regex.findall(tweet_text):
        emoji_count[symbol] += 1

#By adding this we find more "emojis" - is there another way to find these emojis... what if we didn't find them all?
#|[\U0001F600-\U0001F64F]|[\U0001F300-\U0001F5FF]|[\U0001F680-\U0001F6FF]|[\U0001F1E0-\U0001F1FF]|[\U00002702-\U000027B0]|[\U000024C2-\U0001F251]
print(emoji_count)

In [None]:
# Emoji census on the validation tweets using the extended set of ranges.
# Several alternatives overlap (the last one, U+24C2-U+1F251, is very broad),
# which is harmless for counting: each character is matched once.
emoji_regex = re.compile(u'[\U0001f300-\U0001f650]|[\u2000-\u3000]|[\U0001F600-\U0001F64F]|[\U0001F300-\U0001F5FF]|[\U0001F680-\U0001F6FF]|[\U0001F1E0-\U0001F1FF]|[\U00002702-\U000027B0]|[\U000024C2-\U0001F251]')

emoji_count = defaultdict(int)
for tweet_text in twitter_validation['Tweet_content']:
    # Not named `emoji`, so the imported `emoji` module is not rebound.
    for symbol in emoji_regex.findall(tweet_text):
        emoji_count[symbol] += 1

#By adding this we find more "emojis" - is there another way to find these emojis... what if we didn't find them all?
#|[\U0001F600-\U0001F64F]|[\U0001F300-\U0001F5FF]|[\U0001F680-\U0001F6FF]|[\U0001F1E0-\U0001F1FF]|[\U00002702-\U000027B0]|[\U000024C2-\U0001F251]
print(emoji_count)

In [None]:
#remove urls & special characters
def remove_urls(text):
    """Return *text* with every URL deleted.

    A URL is anything that starts with ``http://``, ``https://`` or ``www.``
    and runs up to the next whitespace character. Surrounding whitespace is
    left untouched.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

def remove_emojis(text):
    """Return *text* with every run of emoji-range characters deleted.

    The removed code-point ranges are listed below. Two of them are very
    broad (U+24C2-U+1F251 and U+2000-U+3000), so characters other than
    emojis that fall inside them are removed as well.
    """
    code_point_ranges = (
        u"\U0001F600-\U0001F64F",  # emoticons
        u"\U0001F300-\U0001F5FF",  # symbols and pictographs
        u"\U0001F680-\U0001F6FF",  # transport and map symbols
        u"\U0001F1E0-\U0001F1FF",  # country flags
        u"\U00002702-\U000027B0",  # dingbats
        u"\U000024C2-\U0001F251",  # broad catch-all range
        u"\U0001f300-\U0001f650",
        u"\u2000-\u3000",
    )
    character_class = "[" + "".join(code_point_ranges) + "]+"
    return re.sub(character_class, r'', text, flags=re.UNICODE)
#def remove_and,or,a,the...

In [None]:
# Strip emojis first, then URLs, from every tweet in both frames, and record
# the character length of the cleaned text in a new 'text_lens' column.
for frame in (twitter_training, twitter_validation):
    cleaned = frame['Tweet_content'].apply(remove_emojis).apply(remove_urls)
    frame['Tweet_content'] = cleaned
    frame['text_lens'] = cleaned.apply(len)

In [None]:
# Testing we indeed removed the emojis: the census should now come back empty.

# One character from U+1F300-U+1F650, or one from U+2000-U+3000.
emoji_regex = re.compile(u'[\U0001f300-\U0001f650]|[\u2000-\u3000]')

emoji_count = defaultdict(int)
for tweet_text in twitter_training['Tweet_content']:
    # Not named `emoji`, so the imported `emoji` module is not rebound.
    for symbol in emoji_regex.findall(tweet_text):
        emoji_count[symbol] += 1

print(emoji_count)

In [None]:
# Same check on the validation tweets: the census should come back empty.
# One character from U+1F300-U+1F650, or one from U+2000-U+3000.
emoji_regex = re.compile(u'[\U0001f300-\U0001f650]|[\u2000-\u3000]')

emoji_count = defaultdict(int)
for tweet_text in twitter_validation['Tweet_content']:
    # Not named `emoji`, so the imported `emoji` module is not rebound.
    for symbol in emoji_regex.findall(tweet_text):
        emoji_count[symbol] += 1

print(emoji_count)

In [None]:
# Count information per category: for every (Entity, Sentiment) pair, count
# the non-null values in each remaining column. reset_index() turns the two
# group keys back into ordinary columns.
data1=twitter_training.groupby(by=["Entity","Sentiment"]).count().reset_index()
#data1.head()

In [None]:
# Grouped bar chart: number of tweets per brand (Entity), one bar per sentiment.
# The "Tweet ID" column of data1 holds the per-group row count.
plt.figure(figsize=(20,6))
sns.barplot(data=data1,x="Entity",y="Tweet ID",hue='Sentiment')
plt.xticks(rotation=90)  # entity names are long; rotate so they do not overlap
plt.xlabel("Brand")
plt.ylabel("Number of tweets")
plt.grid()
# Title previously said "Branch"; the axis (and the data) are about brands.
plt.title("Distribution of tweets per Brand and Type")
plt.show()

#### - Data Cleaning / Preprocessing for Modeling

In [None]:
# Entity Recognition
# Distinct entity labels found in each dataset.
Entities_t = list(set(twitter_training['Entity']))
# Built from the *validation* labels. This used to copy Entities_t, which made
# the two lists identical by construction and the comparison below meaningless.
Entities_v = list(set(twitter_validation['Entity']))
EntitiesLowered = [item.lower() for item in Entities_t]

#See if entities in both datasets are the same:
print(Entities_t)
print(len(Entities_t))
#print(Entities_v)
#print(len(Entities_v))

# Entity dictionary: maps a canonical entity label (key) to the list of alias
# spellings (value) that should be treated as a mention of that entity.
# NOTE(review): keys presumably mirror the values of the "Entity" column -
# verify against the printed entity list.
# NOTE(review): a few aliases are mixed-case ('PS', 'callOfdutyblackopscoldWar',
# 'PlayerUnknownsBattlegrounds') while the rest are lower-case - confirm the
# consumer compares case-insensitively.
entity_dict = { 
                "RedDeadRedemption(RDR)" : ['rdr', 'red dead redemption', 'red dead'], 
                "Microsoft": ['microsoft'],
                "Xbox(XSeries)":['xbox', 'series x', 'series s', 'xbox one', 'xseries'], 
                "AssassinsCreed": ['assassinscreed', 'assassins creed'], 
                "CallOfDutyBlackopsColdWar": ['black ops', 'cold war', 'callOfdutyblackopscoldWar'],
                "FIFA": ['fifa'],
                "TomClancysGhostRecon": ['ghost recon', 'ghostrecon'],
                "Google": ['google'],
                "PlayStation(PS)": ['ps5', 'playstation', 'ps4', 'PS'],
                "Facebook": ['facebook'],
                "GrandTheftAuto(GTA)": ['gta', 'grand theft auto'],
                "PlayerUnknownsBattlegrounds(PUBG)": ['pubg', 'player unknowns battlegrounds', 'PlayerUnknownsBattlegrounds'],
                "Hearthstone": ['hearthstone'],
                "MaddenNFL": ['madden'],
                "CallOfDuty": ['modern warfare', 'call of duty', 'cod'],
                "Fortnite": ['fortnitegame', 'fortnite'],
                "Verizon": ['verizon'],
                "Nvidia": ['nvidia'],
                "Amazon": ['amazon'],
                "WorldOfCraft": ['wow', 'world of warcraft'],
                "ApexLegends": ['apex legends', 'apex', 'apexlegends'],
                "CS-GO": ['csgo', 'counter strike'],
                "johnson&johnson": ['johnson&johnson', 'johnson & johnson'],
                "HomeDepot": ['homedepot', 'home depot'],
                "NBA2K": ['nba'],
                "Overwatch": ['overwatch'],
                "LeagueOfLegends": ['lol', 'league of legends'],
                "Borderlands": ['borderlands'],
                "TomClancysRainbowSix": ['rainbow six', 'rainbow six siege', 'rainbowsix'],
                "Dota": ['dota'],
                "Battlefield": ['battlefield'],
                "Cyberpunk2077": ['cyberpunkgame', 'cyberpunk2077', 'cyberpunk'],
                "NintendoSwitch": ['nintendo switch', 'nintendo'],
                "Windows": ['windows', 'window']
              }

In [None]:
# Plan, applied to each entry:
# Step 1: find entity mentions in the sentence and replace them with the
#         label from the entity list.
# Step 2: lower-case every word of the entry.

# Single-digit strings; used below to drop stand-alone digit tokens.
numbers_list = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
#comment = "on Borderlands 1 I will murder all on Xbox the console of microsoft, is like RDR or Assassin's-creed..."
# Work on one sample tweet, looked up by index label 292.
comment = twitter_training["Tweet_content"][292]

# Idea: create a bag of words, e.g. entity_dic = { "RedDeadRedemption(RDR)" : [rdr, RDR, Red Dead]}
# and then test: if word in entity_dic["Red Dead Redemption"]:

# Note 1: some ENTITIES are not found, e.g. xbox, PS5, Red Dead Redemption or rdr; this might cause a loss of accuracy. (Check: )
# - These attempts do not work -
#comment=comment.upper()
#comment=comment.title()
#comment = string.capwords(comment)
# Note 2: what about different games of the same saga, e.g. COD black ops / modern warfare / cold war... what should we do?  (Check: )
# Note 3: should the entity dictionary be hard-coded?
print(comment)

In [None]:
# Delete every character that is not an ASCII letter, a space, or the digit
# 4 or 5 - i.e. apostrophes, commas, other punctuation and the other digits.
comment = re.sub(r"[^a-zA-Z4-5 ]", "", comment)
print(comment)

In [None]:
# Drop tokens that are a single digit and, as a side effect of split()/join(),
# collapse every run of whitespace into one space.
comment = ' '.join([word for word in comment.split() if word not in numbers_list])
comment

In [None]:
# Checking whether a string occurs among the dictionary's alias values

# Sample sentence for manual experiments; nothing in the visible code reads it.
example='I love cod black ops'

# Compiled alias matchers, keyed by a hashable snapshot of the entity mapping,
# so the pattern is built once instead of once per tweet.
_ENTITY_MATCHER_CACHE = {}


def _normalise_for_entities(text):
    """Lower-case *text*, keep only ASCII letters, single-space the words."""
    # Turn tabs/newlines into spaces first; otherwise deleting them would glue
    # two neighbouring words together.
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"[^a-zA-Z ]", "", text)
    return ' '.join(text.split()).lower()


def _entity_matcher(entities):
    """Return ``(pattern, alias_to_name)`` for *entities*, building it once."""
    snapshot = tuple((name, tuple(aliases)) for name, aliases in entities.items())
    cached = _ENTITY_MATCHER_CACHE.get(snapshot)
    if cached is not None:
        return cached

    alias_to_name = {}
    for name, aliases in entities.items():
        for alias in aliases:
            # Aliases get the same clean-up as the text. Without it, entries
            # such as 'ps5', 'PS' or 'johnson & johnson' can never equal
            # anything in the cleaned, lower-cased text.
            cleaned_alias = _normalise_for_entities(alias)
            if cleaned_alias:
                # First entity to claim an alias keeps it.
                alias_to_name.setdefault(cleaned_alias, name)

    pattern = None
    if alias_to_name:
        # Longest alias first, so 'xbox one' is preferred over 'xbox'.
        ordered = sorted(alias_to_name, key=len, reverse=True)
        alternatives = "|".join(re.escape(alias) for alias in ordered)
        pattern = re.compile(r"\b(?:" + alternatives + r")\b")

    _ENTITY_MATCHER_CACHE[snapshot] = (pattern, alias_to_name)
    return pattern, alias_to_name


def find_entities_list(text, entities=None):
    """Clean *text* and replace entity aliases with their canonical label.

    The text is reduced to lower-case ASCII letters separated by single
    spaces (digits and punctuation are deleted, any whitespace counts as a
    word break). Every whole-word occurrence of an alias is then replaced by
    the key it belongs to; keys are inserted verbatim, so they keep their
    original capitalisation.

    Matching is done in a single pass, so inserted labels are never
    re-scanned, and an alias only matches complete words ('cod' does not
    match inside 'code').

    Args:
        text: The raw tweet text.
        entities: Mapping of canonical label -> iterable of alias strings.
            Defaults to the module-level ``entity_dict``.

    Returns:
        The cleaned text with aliases replaced by canonical labels.
    """
    if entities is None:
        entities = entity_dict
    cleaned = _normalise_for_entities(text)
    pattern, alias_to_name = _entity_matcher(entities)
    if pattern is None:
        return cleaned
    return pattern.sub(lambda match: alias_to_name[match.group(0)], cleaned)

# Try the function on the sample tweet held in `comment`.
print(find_entities_list(comment))

In [None]:
# Display the tweet stored under index label 24432.
twitter_training["Tweet_content"][24432]

In [None]:
# Run the entity replacement on the tweet stored under index label 71659.
print(find_entities_list(twitter_training["Tweet_content"][71659]))

In [None]:
# Run the entity replacement over every training tweet (row order and index
# are preserved), then display the updated column.
twitter_training['Tweet_content'] = twitter_training['Tweet_content'].apply(find_entities_list)
twitter_training['Tweet_content']

#### - Tokenization/Bag of words

In [None]:
#twitter_training['Tweet_content'] = [word.lower() for word in twitter_training['Tweet_content']]
#twitter_training['Tweet_content']

# Tokenize every tweet and flatten the result straight into one list of
# string tokens covering the whole training set.
tokenized_twitter_training = [
    str(token)
    for tweet in twitter_training['Tweet_content']
    for token in word_tokenize(tweet)
]
#tokenized_twitter_training[:25]

In [None]:
# Removing stopwords, punctuation and numbers
stopwords_list = stopwords.words('english')
stopwords_list += list(string.punctuation)
stopwords_list += ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
# NOTE: the last entry is the source text of a regular expression. As a list
# element it is compared literally against tokens, not applied as a pattern.
stopwords_list += ['....','...', '..', '’', "''", '``', '-',"([a-zA-Z]+(?:'[a-z]+)?)"]
# Membership in a set is O(1); testing against the list scanned every
# stop word for every token in the corpus.
stopwords_set = set(stopwords_list)
tokenized_twitter_training_stopped = [word for word in tokenized_twitter_training if word not in stopwords_set]
#tokenized_twitter_training_stopped[:25]

In [None]:
# Frequency distribution: how many times each remaining token occurs across
# the whole training corpus (not per tweet). Show the 25 most frequent.
tokenized_twitter_training_freqdist = FreqDist(tokenized_twitter_training_stopped)
tokenized_twitter_training_freqdist.most_common(25)

#### - Stemming and Lemmatization

In [None]:
# NOTE: WordNetLemmatizer is already imported at the top of the notebook;
# this repeat import is harmless.
from nltk.stem.wordnet import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# Lemmatize every filtered token (lemmatize() is called without a POS
# argument) and preview the first ten results.
tokenized_twitter_training_lemmatized = [lemmatizer.lemmatize(word) for word in tokenized_twitter_training_stopped]
tokenized_twitter_training_lemmatized[:10]
#lemmatizer.lemmatize('drank')

In [None]:
ps = PorterStemmer()

# Stem every filtered token with the Porter stemmer, as an alternative to the
# lemmatized list, and preview the first ten results.
tokenized_twitter_training_stemmed = [ps.stem(word) for word in tokenized_twitter_training_stopped]
tokenized_twitter_training_stemmed[:10]

# Quick manual check of the stemmer on a few related word forms:
#words = ["play", "playing", "played", "player"]
 
#for w in words:
#    print(w, " : ", ps.stem(w))

In [None]:
# Number of distinct tokens in the frequency distribution (vocabulary size).
len(tokenized_twitter_training_freqdist)

In [None]:
# Score every adjacent token pair (bigram) in the filtered token stream by
# its raw relative frequency.
bigram_measures = nltk.collocations.BigramAssocMeasures()
tokenized_twitter_training_stopped_finder = BigramCollocationFinder.from_words(tokenized_twitter_training_stopped)
tokenized_twitter_training_stopped_scored = tokenized_twitter_training_stopped_finder.score_ngrams(bigram_measures.raw_freq)
# Display the first 35 scored bigrams
tokenized_twitter_training_stopped_scored[:35]

#### - Train/Test Splits

In [None]:
# Data Cleaning
def remove_outlier(df_in, col_name, k=1.5):
    """Return the rows of *df_in* whose *col_name* value is not an outlier.

    Outliers are detected with Tukey's fences: a value is kept when it lies
    inside ``[Q1 - k*IQR, Q3 + k*IQR]``, where Q1/Q3 are the 25th/75th
    percentiles of the column and IQR = Q3 - Q1.

    Args:
        df_in: Input DataFrame; it is not modified.
        col_name: Name of the numeric column to test.
        k: Fence multiplier; 1.5 is the conventional value.

    Returns:
        The subset of *df_in* (original index labels kept) whose value lies
        within the fences. Rows with a missing value in *col_name* are
        dropped, because comparisons with NaN are false.
    """
    values = df_in[col_name]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1  # interquartile range
    fence_low = q1 - k * iqr
    fence_high = q3 + k * iqr
    # Inclusive bounds: a value lying exactly on a fence is not an outlier.
    # With strict comparisons a column whose IQR is 0 lost every single row.
    return df_in.loc[(values >= fence_low) & (values <= fence_high)]

# Remove outliers: keep only training tweets whose 'text_lens' value lies
# within the IQR fences computed by remove_outlier.
twitter_training = remove_outlier(twitter_training,'text_lens')

In [None]:
# Tokenization and lemmatization with spaCy.
# Loads the 'en_core_web_sm' pipeline; the model must be installed.
nlp=spacy.load('en_core_web_sm')

def preprocess(text):
    """Return the lemmas of the content tokens of *text*, space-separated.

    The text is run through the module-level spaCy pipeline ``nlp``. Tokens
    flagged as stop words or as punctuation are skipped; every other token
    contributes its lemma, in the original order.
    """
    lemmas = [
        token.lemma_
        for token in nlp(text)
        if not (token.is_stop or token.is_punct)
    ]
    return " ".join(lemmas)
    

# Store the stop-word-free, lemmatized form of every tweet in a new
# 'preprocessed_text' column, for the training and the validation frame.
for frame in (twitter_training, twitter_validation):
    frame['preprocessed_text'] = frame['Tweet_content'].apply(preprocess)

In [None]:
# The preprocessed text of every training tweet is already stored in the
# 'preprocessed_text' column (same function, same input), so reuse it instead
# of running the whole spaCy pipeline over the corpus a second time.
list1 = twitter_training['preprocessed_text'].copy()


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the training frame as a test split.
# random_state=42 fixes the shuffle so the split is reproducible.
# Both selections use double brackets, so X_* and y_* are single-column
# DataFrames rather than Series.
X_train ,X_test , y_train, y_test = train_test_split(
    twitter_training[['preprocessed_text']],
    twitter_training[['Sentiment']],
    test_size=0.2,
    random_state=42
)