# Preprocessing political tweets data
---

# 1. Import required modules

## 1.1. To install all required libraries

In [1]:
# Uncomment below line to install all the required dependencies
# !pip install -r ../requirements.txt -q

## 1.2. Imports

In [2]:
import re
import itertools
import os

from IPython.display import clear_output

import numpy as np
import pandas as pd
import nltk

from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt')
from nltk.stem.snowball import SnowballStemmer

import seaborn as sns
import matplotlib.pyplot as plt

import emoji

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/varun487/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/varun487/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


---

# 2. Creating the political tweets dataset

## 2.1. Show all csvs

In [3]:
# All csvs of tweets
os.listdir('../DataCollection/political_tweets_data/')

['rsprasad_tweets.csv',
 'myogiadityanath_tweets.csv',
 'OfficeofUT_tweets.csv',
 'rahulgandhi_tweets.csv',
 'CMOTamilNadu_tweets.csv',
 'ArvindKejriwal_tweets.csv',
 'smritiirani_tweets.csv',
 'PiyushGoyalOffc_tweets.csv',
 'AmitShah_tweets.csv',
 'nsitharaman_tweets.csv',
 'narendramodi_tweets.csv',
 'rajnathsingh_tweets.csv',
 'Dev_Fadnavis_tweets.csv',
 'mamataofficial_tweets.csv',
 'NitishKumar_tweets.csv',
 'nitin_gadkari_tweets.csv',
 'DrSJaishankar_tweets.csv']

## 2.2. Create a single Dataframe of all tweets

In [4]:
data = []

for csv in os.listdir('../DataCollection/political_tweets_data/'):
    df = pd.read_csv(f"../DataCollection/political_tweets_data/{csv}")
    #print("---------------------------------")
    #print()
    #print(df.head())
    data.append(df)

# print(data)
df = pd.concat(data)
df.head()

Unnamed: 0,mp,tweet_date,tweet_time,tweet_id,tweet_text
0,rsprasad,2021-03-31,23:30:00,1377401315102355458,"Please, don't break the trust.\n\n@narendramod..."
1,rsprasad,2021-03-31,23:30:00,1377400199425843200,"MAINTENANCE OF WIVES, CHILDREN AND PARENTS - h..."
2,rsprasad,2021-03-31,23:30:00,1377400148972642308,NIA look into SSR case \n\n@AmitShah @narendra...
3,rsprasad,2021-03-31,23:30:00,1377395982019620864,@PavanjitMane1 @PMOIndia @CMOMaharashtra @rspr...
4,rsprasad,2021-03-31,23:30:00,1377390183193014276,@rsprasad @narendramodi @PMOIndia @examwarrior...


In [5]:
df.tail()

Unnamed: 0,mp,tweet_date,tweet_time,tweet_id,tweet_text
2044,DrSJaishankar,2021-03-30,7:30:00,1376774949768032256,@RDXThinksThat @HQ_IDS_India @adgpi @IAF_MCC @...
2045,DrSJaishankar,2021-03-30,7:30:00,1376774904041836546,@ANI Dhame on u @ImranKhanPTI\nActually @Govto...
2046,DrSJaishankar,2021-03-30,7:30:00,1376774837880774660,"@swati_gs @DrSJaishankar Sir, Please ensure t..."
2047,DrSJaishankar,2021-03-30,7:30:00,1376774570854674433,@DrSJaishankar @vijai63 India is proud of our ...
2048,DrSJaishankar,2021-03-30,7:30:00,1376774475769843712,🔸As India-UAE relationship grow stronger day b...


## 2.3. Reset index

In [6]:
df.reset_index(inplace=True, drop=True)

In [7]:
df.head()

Unnamed: 0,mp,tweet_date,tweet_time,tweet_id,tweet_text
0,rsprasad,2021-03-31,23:30:00,1377401315102355458,"Please, don't break the trust.\n\n@narendramod..."
1,rsprasad,2021-03-31,23:30:00,1377400199425843200,"MAINTENANCE OF WIVES, CHILDREN AND PARENTS - h..."
2,rsprasad,2021-03-31,23:30:00,1377400148972642308,NIA look into SSR case \n\n@AmitShah @narendra...
3,rsprasad,2021-03-31,23:30:00,1377395982019620864,@PavanjitMane1 @PMOIndia @CMOMaharashtra @rspr...
4,rsprasad,2021-03-31,23:30:00,1377390183193014276,@rsprasad @narendramodi @PMOIndia @examwarrior...


In [8]:
df.tail()

Unnamed: 0,mp,tweet_date,tweet_time,tweet_id,tweet_text
34553,DrSJaishankar,2021-03-30,7:30:00,1376774949768032256,@RDXThinksThat @HQ_IDS_India @adgpi @IAF_MCC @...
34554,DrSJaishankar,2021-03-30,7:30:00,1376774904041836546,@ANI Dhame on u @ImranKhanPTI\nActually @Govto...
34555,DrSJaishankar,2021-03-30,7:30:00,1376774837880774660,"@swati_gs @DrSJaishankar Sir, Please ensure t..."
34556,DrSJaishankar,2021-03-30,7:30:00,1376774570854674433,@DrSJaishankar @vijai63 India is proud of our ...
34557,DrSJaishankar,2021-03-30,7:30:00,1376774475769843712,🔸As India-UAE relationship grow stronger day b...


## 2.4. Remove tweet_id

In [9]:
df = df.drop(['tweet_id'], axis = 1)

In [10]:
df.head()

Unnamed: 0,mp,tweet_date,tweet_time,tweet_text
0,rsprasad,2021-03-31,23:30:00,"Please, don't break the trust.\n\n@narendramod..."
1,rsprasad,2021-03-31,23:30:00,"MAINTENANCE OF WIVES, CHILDREN AND PARENTS - h..."
2,rsprasad,2021-03-31,23:30:00,NIA look into SSR case \n\n@AmitShah @narendra...
3,rsprasad,2021-03-31,23:30:00,@PavanjitMane1 @PMOIndia @CMOMaharashtra @rspr...
4,rsprasad,2021-03-31,23:30:00,@rsprasad @narendramodi @PMOIndia @examwarrior...


In [11]:
df.tail()

Unnamed: 0,mp,tweet_date,tweet_time,tweet_text
34553,DrSJaishankar,2021-03-30,7:30:00,@RDXThinksThat @HQ_IDS_India @adgpi @IAF_MCC @...
34554,DrSJaishankar,2021-03-30,7:30:00,@ANI Dhame on u @ImranKhanPTI\nActually @Govto...
34555,DrSJaishankar,2021-03-30,7:30:00,"@swati_gs @DrSJaishankar Sir, Please ensure t..."
34556,DrSJaishankar,2021-03-30,7:30:00,@DrSJaishankar @vijai63 India is proud of our ...
34557,DrSJaishankar,2021-03-30,7:30:00,🔸As India-UAE relationship grow stronger day b...


## 2.5. All politicians whose data has been collected

In [12]:
for pol in df.mp.unique():
    print(pol)

rsprasad
myogiadityanath
OfficeofUT
rahulgandhi
CMOTamilNadu
ArvindKejriwal
smritiirani
PiyushGoyalOffc
AmitShah
nsitharaman
narendramodi
rajnathsingh
Dev_Fadnavis
mamataofficial
NitishKumar
nitin_gadkari
DrSJaishankar


---

# 3. Clean tweets

- Credits for the emoticons and contractions dictionaries
    - https://towardsdatascience.com/twitter-sentiment-analysis-using-fasttext-9ccd04465597

## 3.1. Defining emoticons and contractions to replace

In [13]:
emoticons =  {
        ":‑)":"smiley",
        ":-]":"smiley",
        ":-3":"smiley",
        ":->":"smiley",
        "8-)":"smiley",
        ":-}":"smiley",
        ":)":"smiley",
        ":]":"smiley",
        ":3":"smiley",
        ":>":"smiley",
        "8)":"smiley",
        ":}":"smiley",
        ":o)":"smiley",
        ":c)":"smiley",
        ":^)":"smiley",
        "=]":"smiley",
        "=)":"smiley",
        ":-))":"smiley",
        ":‑D":"smiley",
        "8‑D":"smiley",
        "x‑D":"smiley",
        "X‑D":"smiley",
        ":D":"smiley",
        "8D":"smiley",
        "xD":"smiley",
        "XD":"smiley",
        ":‑(":"sad",
        ":‑c":"sad",
        ":‑<":"sad",
        ":‑[":"sad",
        ":(":"sad",
        ":c":"sad",
        ":<":"sad",
        ":[":"sad",
        ":-||":"sad",
        ">:[":"sad",
        ":{":"sad",
        ":@":"sad",
        ">:(":"sad",
        ":'‑(":"sad",
        ":'(":"sad",
        ":‑P":"playful",
        "X‑P":"playful",
        "x‑p":"playful",
        ":‑p":"playful",
        ":‑Þ":"playful",
        ":‑þ":"playful",
        ":‑b":"playful",
        ":P":"playful",
        "XP":"playful",
        "xp":"playful",
        ":p":"playful",
        ":Þ":"playful",
        ":þ":"playful",
        ":b":"playful",
        "<3":"love"
        }

In [14]:
contractions = {
        "ain't":"is not",
        "amn't":"am not",
        "aren't":"are not",
        "can't":"cannot",
        "'cause":"because",
        "couldn't":"could not",
        "couldn't've":"could not have",
        "could've":"could have",
        "daren't":"dare not",
        "daresn't":"dare not",
        "dasn't":"dare not",
        "didn't":"did not",
        "doesn't":"does not",
        "don't":"do not",
        "e'er":"ever",
        "em":"them",
        "everyone's":"everyone is",
        "finna":"fixing to",
        "gimme":"give me",
        "gonna":"going to",
        "gon't":"go not",
        "gotta":"got to",
        "hadn't":"had not",
        "hasn't":"has not",
        "haven't":"have not",
        "he'd":"he would",
        "he'll":"he will",
        "he's":"he is",
        "he've":"he have",
        "how'd":"how would",
        "how'll":"how will",
        "how're":"how are",
        "how's":"how is",
        "I'd":"I would",
        "I'll":"I will",
        "I'm":"I am",
        "i'm": "I am",
        "I'm'a":"I am about to",
        "I'm'o":"I am going to",
        "isn't":"is not",
        "it'd":"it would",
        "it'll":"it will",
        "it's":"it is",
        "I've":"I have",
        "kinda":"kind of",
        "let's":"let us",
        "mayn't":"may not",
        "may've":"may have",
        "mightn't":"might not",
        "might've":"might have",
        "mustn't":"must not",
        "mustn't've":"must not have",
        "must've":"must have",
        "needn't":"need not",
        "ne'er":"never",
        "o'":"of",
        "o'er":"over",
        "ol'":"old",
        "oughtn't":"ought not",
        "shalln't":"shall not",
        "shan't":"shall not",
        "she'd":"she would",
        "she'll":"she will",
        "she's":"she is",
        "shouldn't":"should not",
        "shouldn't've":"should not have",
        "should've":"should have",
        "somebody's":"somebody is",
        "someone's":"someone is",
        "something's":"something is",
        "that'd":"that would",
        "that'll":"that will",
        "that're":"that are",
        "that's":"that is",
        "there'd":"there would",
        "there'll":"there will",
        "there're":"there are",
        "there's":"there is",
        "these're":"these are",
        "they'd":"they would",
        "they'll":"they will",
        "they're":"they are",
        "they've":"they have",
        "this's":"this is",
        "those're":"those are",
        "'tis":"it is",
        "'twas":"it was",
        "wanna":"want to",
        "wasn't":"was not",
        "we'd":"we would",
        "we'd've":"we would have",
        "we'll":"we will",
        "we're":"we are",
        "weren't":"were not",
        "we've":"we have",
        "what'd":"what did",
        "what'll":"what will",
        "what're":"what are",
        "what's":"what is",
        "what've":"what have",
        "when's":"when is",
        "where'd":"where did",
        "where're":"where are",
        "where's":"where is",
        "where've":"where have",
        "which's":"which is",
        "who'd":"who would",
        "who'd've":"who would have",
        "who'll":"who will",
        "who're":"who are",
        "who's":"who is",
        "who've":"who have",
        "why'd":"why did",
        "why're":"why are",
        "why's":"why is",
        "won't":"will not",
        "wouldn't":"would not",
        "would've":"would have",
        "y'all":"you all",
        "you'd":"you would",
        "you'll":"you will",
        "you're":"you are",
        "you've":"you have",
        "Whatcha":"What are you",
        "luv":"love",
        "sux":"sucks"
        }

---

# 3.2. Load stop words

In [15]:
# stop words
stop = set(stopwords.words("english"))
stop

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

## 3.3. Function to clean tweets

In [16]:
def clean_tweet(tweet):
    
    # Remove @ mentions
    cleaned_tweet = re.sub(r'@[A-Za-z0-9_]+', '', tweet)
    
    # Remove hashtags
    cleaned_tweet = re.sub(r'#[A-Za-z0-9_]+', '', cleaned_tweet)
    
    # Remove hyperlinks
    cleaned_tweet = re.sub(r'https?://[A-Za-z0-9_./?]+', '', cleaned_tweet)
    
    # Replace Contractions
    cleaned_tweet = cleaned_tweet.replace("’","'")
    cleaned_tweet = " ".join([contractions[word] if word in contractions else word for word in cleaned_tweet.split()])
    
    # Replace emoticons
    cleaned_tweet = cleaned_tweet.replace("’","'")
    cleaned_tweet = " ".join([emoticons[word] if word in emoticons else word for word in cleaned_tweet.split()])
    
    # Replace emojis
    cleaned_tweet = emoji.demojize(cleaned_tweet)
    cleaned_tweet = cleaned_tweet.replace(":"," ")
    cleaned_tweet = ' '.join(cleaned_tweet.split())
    
    # Fix misspelled words
    cleaned_tweet = ''.join(''.join(s)[:2] for _, s in itertools.groupby(cleaned_tweet))
    
    # Remove special characters
    cleaned_tweet = re.sub(r'&[a-zA-Z0-9_]+;', '', cleaned_tweet)
    
    # Remove new line
    cleaned_tweet = re.sub(r'\n', '', cleaned_tweet)
    
    # Remove tab
    cleaned_tweet = re.sub(r'\t', '', cleaned_tweet)
    
    # Remove html
    cleaned_tweet = re.sub(r'<.*?>', '', cleaned_tweet)
    
    # Remove punctuations
    cleaned_tweet = re.sub(r'[-,:;.+?<>()!%=_*&^$/]', ' ', cleaned_tweet)
    
    # Remove apostrphe
    cleaned_tweet = re.sub(r'[\']', '', cleaned_tweet)
    
    # Remove numbers
    cleaned_tweet = re.sub(r'[0-9]+', '', cleaned_tweet)
    
    # Remove all single characters
    cleaned_tweet = re.sub(r'\s+[a-zA-Z]\s+', ' ', cleaned_tweet)
    
    # Substituting multiple spaces with single space
    cleaned_tweet= re.sub(r'\s+', ' ', cleaned_tweet, flags=re.I)
 
    # Removing prefixed 'b'
    cleaned_tweet = re.sub(r'^b\s+', '', cleaned_tweet)
 
    # Converting to Lowercase
    cleaned_tweet = cleaned_tweet.lower()
    
    # Remove stop words
    cleaned_tweet = " ".join([word for word in cleaned_tweet.split() if word not in stop])
    
    return cleaned_tweet


In [17]:
cleaned_tweets = []
count = 0

for tweet in df['tweet_text']:
#     print("Tweet:", tweet)
#     print()
#     print("Cleaned Tweet:", clean_tweet(tweet))
#     print()
#     print()
    cleaned_tweets.append(clean_tweet(tweet))
    
    count += 1
    if count % 1000 == 0:
        clear_output(wait=True)
        print(f"Completed cleaning {count}/34557 tweets.")
    
df['cleaned_tweet'] = cleaned_tweets

Completed cleaning 34000/34557 tweets.


In [18]:
df.head()

Unnamed: 0,mp,tweet_date,tweet_time,tweet_text,cleaned_tweet
0,rsprasad,2021-03-31,23:30:00,"Please, don't break the trust.\n\n@narendramod...",please break trust
1,rsprasad,2021-03-31,23:30:00,"MAINTENANCE OF WIVES, CHILDREN AND PARENTS - h...",maintenance wives children parents l c h r read
2,rsprasad,2021-03-31,23:30:00,NIA look into SSR case \n\n@AmitShah @narendra...,nia look ssr case
3,rsprasad,2021-03-31,23:30:00,@PavanjitMane1 @PMOIndia @CMOMaharashtra @rspr...,new pan allotted per aadhar many pans issued b...
4,rsprasad,2021-03-31,23:30:00,@rsprasad @narendramodi @PMOIndia @examwarrior...,one worst govt till interest reduced income ta...


In [19]:
df.tail()

Unnamed: 0,mp,tweet_date,tweet_time,tweet_text,cleaned_tweet
34553,DrSJaishankar,2021-03-30,7:30:00,@RDXThinksThat @HQ_IDS_India @adgpi @IAF_MCC @...,happy holi sir green heart
34554,DrSJaishankar,2021-03-30,7:30:00,@ANI Dhame on u @ImranKhanPTI\nActually @Govto...,dhame actually renamed nauseated face nauseate...
34555,DrSJaishankar,2021-03-30,7:30:00,"@swati_gs @DrSJaishankar Sir, Please ensure t...",sir please ensure bangladesh foriegn ministry ...
34556,DrSJaishankar,2021-03-30,7:30:00,@DrSJaishankar @vijai63 India is proud of our ...,india proud pm shri narendra modi ji hearty co...
34557,DrSJaishankar,2021-03-30,7:30:00,🔸As India-UAE relationship grow stronger day b...,small orange diamond india uae relationship gr...


---

# 4. Snowball Stemmer on cleaned tweets

In [20]:
stemmed_tweets = []
ss = SnowballStemmer("english")

count = 0

for tweet in df['cleaned_tweet']:
    words = word_tokenize(tweet)
    temp = []
    for w in words:
        root = ss.stem(w)
        temp.append(root)
    temp =' '.join(temp)
    stemmed_tweets.append(temp)
    
    count += 1
    if count % 1000 == 0:
        clear_output(wait=True)
        print(f"Completed Stemming {count}/34557 tweets.")
    
df['Snowball_Stem'] = stemmed_tweets

Completed Stemming 34000/34557 tweets.


In [21]:
df.head()

Unnamed: 0,mp,tweet_date,tweet_time,tweet_text,cleaned_tweet,Snowball_Stem
0,rsprasad,2021-03-31,23:30:00,"Please, don't break the trust.\n\n@narendramod...",please break trust,pleas break trust
1,rsprasad,2021-03-31,23:30:00,"MAINTENANCE OF WIVES, CHILDREN AND PARENTS - h...",maintenance wives children parents l c h r read,mainten wive children parent l c h r read
2,rsprasad,2021-03-31,23:30:00,NIA look into SSR case \n\n@AmitShah @narendra...,nia look ssr case,nia look ssr case
3,rsprasad,2021-03-31,23:30:00,@PavanjitMane1 @PMOIndia @CMOMaharashtra @rspr...,new pan allotted per aadhar many pans issued b...,new pan allot per aadhar mani pan issu bef aad...
4,rsprasad,2021-03-31,23:30:00,@rsprasad @narendramodi @PMOIndia @examwarrior...,one worst govt till interest reduced income ta...,one worst govt till interest reduc incom tax c...


# 8. Preprocessed data to csv

In [22]:
df.to_csv("./data/cleaned_political_tweets.csv", index = False)

---