In [207]:
#import the packages
import pandas as pd
import numpy as np
import nltk as nlp

In [208]:
#load the csv file
df = pd.read_csv("ExtractedTweets.csv")
#drop the missing values row wise
df.dropna(axis = 0, inplace = True)
df.head()

Unnamed: 0,Party,Handle,Tweet
0,Democrat,RepDarrenSoto,"Today, Senate Dems vote to #SaveTheInternet. P..."
1,Democrat,RepDarrenSoto,RT @WinterHavenSun: Winter Haven resident / Al...
2,Democrat,RepDarrenSoto,RT @NBCLatino: .@RepDarrenSoto noted that Hurr...
3,Democrat,RepDarrenSoto,RT @NALCABPolicy: Meeting with @RepDarrenSoto ...
4,Democrat,RepDarrenSoto,RT @Vegalteno: Hurricane season starts on June...


In [209]:
# Data Types
df.dtypes

Party     object
Handle    object
Tweet     object
dtype: object

In [210]:
#assigning Decomart label and Republican label with 1 and 0 respectively
df["Party_log"] = [1 if each == "Democrat" else 0 for each in df.Party]
print(df.shape)
df

(86460, 4)


Unnamed: 0,Party,Handle,Tweet,Party_log
0,Democrat,RepDarrenSoto,"Today, Senate Dems vote to #SaveTheInternet. P...",1
1,Democrat,RepDarrenSoto,RT @WinterHavenSun: Winter Haven resident / Al...,1
2,Democrat,RepDarrenSoto,RT @NBCLatino: .@RepDarrenSoto noted that Hurr...,1
3,Democrat,RepDarrenSoto,RT @NALCABPolicy: Meeting with @RepDarrenSoto ...,1
4,Democrat,RepDarrenSoto,RT @Vegalteno: Hurricane season starts on June...,1
...,...,...,...,...
86455,Republican,RepTomPrice,Check out my op-ed on need for End Executive O...,0
86456,Republican,RepTomPrice,"Yesterday, Betty &amp; I had a great time lear...",0
86457,Republican,RepTomPrice,We are forever grateful for the service and sa...,0
86458,Republican,RepTomPrice,Happy first day of school @CobbSchools! #CobbB...,0


In [211]:
#First tweet, trying to understand the tweets in the dataset
df.Tweet[0]

'Today, Senate Dems vote to #SaveTheInternet. Proud to support similar #NetNeutrality legislation here in the House… https://t.co/n3tggDLU1L'

In [212]:
df.Tweet[1]

'RT @WinterHavenSun: Winter Haven resident / Alta Vista teacher is one of several recognized by @RepDarrenSoto for National Teacher Apprecia…'

In [213]:
df.Tweet[86459]

'#Zika fears realized in Florida. House GOP acted to prevent crisis. Dems inaction, inexcusable! Time to put politics aside &amp; work together!'

In [214]:
#Count for each label
df.Party.value_counts()

Republican    44392
Democrat      42068
Name: Party, dtype: int64

## Data Preprocessing


In [215]:
#importing the packages necessary for Data Preprocessing
#Regular expression
import re
#For string operations
import string
#Package for Natural language processing
import nltk
#For lemmatization
from nltk.stem.wordnet import WordNetLemmatizer
#To remove the stop words
from wordcloud import STOPWORDS

In [216]:
#counting those words which we don't need 
#initializaing each count to 0
url_count = 0
punc_count = 0
number_count = 0
mention_count = 0
other_than_character_count = 0

 
for i in range(len(df.Tweet)):#run the for loop till the end of tweet
    
  url_re = re.findall('http\S+',df.Tweet[i])
  url_count += len(url_re)
  punc_re = re.findall('[%s]',df.Tweet[i]) 
  punc_count += len(punc_re)
  num_re = re.findall('(\d+)', df.Tweet[i])
  number_count += len(num_re)
  mention_re = re.findall('@(\w+)', df.Tweet[i])
  mention_count += len(mention_re)
  alpha_re = re.findall("[^a-zA-Z]", df.Tweet[i])
  other_than_character_count += len(alpha_re)

print ("Count of url in tweet:", url_count)
print ("Count of punctuation in tweet:",punc_count)
print ("Count of numbers in tweet:",number_count)
print ("Count of mentions in tweet:",mention_count)
print ("Count of alphanum in tweet:",other_than_character_count)

Count of url in tweet: 68405
Count of punctuation in tweet: 537631
Count of numbers in tweet: 128691
Count of mentions in tweet: 64786
Count of alphanum in tweet: 2465479


In [217]:
def round1(data_str): 
    url_re = re.compile('http\S+') 
    punc_re = re.compile('[%s]' % re.escape(string.punctuation))
    num_re = re.compile('(\d+)')
    alpha_re = re.compile("[^a-zA-Z]")
    # convert to lowercase
    data_str = data_str.lower()
    # remove hyperlinks
    data_str = url_re.sub(' ', data_str)
    # remove punctuation
    data_str = punc_re.sub(' ', data_str)
    # remove numeric 'words'
    data_str = num_re.sub(' ', data_str)
    data_str = alpha_re.sub(' ', data_str)
    return data_str

In [218]:
#Performing lemmatization
def lemmatize(data_str):
    #initial index position for string 0
    list_pos = 0
    #intialize empty string to store the lemma words
    cleaned_str = ''
    #tokenize the sentence
    data_str = nltk.word_tokenize(data_str)
    lemma = nlp.WordNetLemmatizer()
    #for each word in sentence run a for loop and find the lemma word and add it to cleaned_str
    for word in data_str: 
        lem = lemma.lemmatize(word) 
        if list_pos == 0: 
            cleaned_str = lem 
        else:
            cleaned_str = cleaned_str + ' ' + lem
        list_pos += 1     
    return cleaned_str
    

In [219]:
#adding some more words to stopwords which was found during EDA process
STOPWORDS.add("rt")
STOPWORDS.add("s")
STOPWORDS.add("u")
STOPWORDS.add("amp")
STOPWORDS.add("th")
STOPWORDS.add("will")
STOPWORDS.add("t")
STOPWORDS.add("m")
STOPWORDS.add("ha")
STOPWORDS.add("wa")

In [220]:
# Removing stop words
def remove_stops(data_str):
    list_pos = 0
    cleaned_str = ''
    text = data_str.split()
    for word in text:
        #if word is not present in stopword add that to cleaned_str
        if word not in STOPWORDS:
            if list_pos == 0:
                cleaned_str = word
            else:
                cleaned_str = cleaned_str + ' ' + word
            list_pos += 1
    return cleaned_str


In [221]:
#Cleaning the data 
#empty list
data_clean = []
for i in range(len(df.Tweet)):
    #removing puntuation, links, numbers
    res = round1(df.Tweet[i])
    #removing the stopwords
    res1 = remove_stops(res)
    #performing lemmatization
    res2 = lemmatize(res1)
    data_clean.append(res2)

In [222]:
#cleaned Tweets
data_clean

['today senate dems vote savetheinternet proud support similar netneutrality legislation house',
 'winterhavensun winter haven resident alta vista teacher one several recognized repdarrensoto national teacher apprecia',
 'nbclatino repdarrensoto noted hurricane maria left approximately billion damage congress allocated',
 'nalcabpolicy meeting repdarrensoto thanks taking time meet latinoleader ed marucci guzman nalcabpolicy',
 'vegalteno hurricane season start june st puerto rico readiness well pwr puertorico repdarrensoto espaillatny',
 'emgageactionfl thank came orlando gala successful night possible without',
 'hurricane maria left approx billion damage yet billion allocated rebuilding grid surpr',
 'tharryry delighted repdarrensoto voting cra overrule fcc save netneutrality rule find',
 'hispaniccaucus trump anti immigrant policy hurting small business across country find american willing',
 'repstephmurphy great joining weareunidosus repdarrensoto roundtable orlando federal issue 

In [223]:
#add the cleaned data to the dataframe
df['clean_tweet']=data_clean

In [224]:
df.clean_tweet[0]

'today senate dems vote savetheinternet proud support similar netneutrality legislation house'

In [225]:
#dataframe with the party_log and clean_tweet
df

Unnamed: 0,Party,Handle,Tweet,Party_log,clean_tweet
0,Democrat,RepDarrenSoto,"Today, Senate Dems vote to #SaveTheInternet. P...",1,today senate dems vote savetheinternet proud s...
1,Democrat,RepDarrenSoto,RT @WinterHavenSun: Winter Haven resident / Al...,1,winterhavensun winter haven resident alta vist...
2,Democrat,RepDarrenSoto,RT @NBCLatino: .@RepDarrenSoto noted that Hurr...,1,nbclatino repdarrensoto noted hurricane maria ...
3,Democrat,RepDarrenSoto,RT @NALCABPolicy: Meeting with @RepDarrenSoto ...,1,nalcabpolicy meeting repdarrensoto thanks taki...
4,Democrat,RepDarrenSoto,RT @Vegalteno: Hurricane season starts on June...,1,vegalteno hurricane season start june st puert...
...,...,...,...,...,...
86455,Republican,RepTomPrice,Check out my op-ed on need for End Executive O...,0,check op ed need end executive overreach act w...
86456,Republican,RepTomPrice,"Yesterday, Betty &amp; I had a great time lear...",0,yesterday betty great time learning forestry i...
86457,Republican,RepTomPrice,We are forever grateful for the service and sa...,0,forever grateful service sacrifice major barney
86458,Republican,RepTomPrice,Happy first day of school @CobbSchools! #CobbB...,0,happy first day school cobbschools cobbbacktos...


In [226]:
import pickle

In [227]:
#serialize object to file, which can be used later
df.to_pickle("corpus.pkl")