In [17]:
import pandas as pd
import numpy as np
import nltk
import string
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [4]:
# Load training data
file_path = 'data/training-Obama-Romney-tweets.xlsx'

# Read both candidate sheets in one pass: passing a list as sheet_name makes
# read_excel return a dict of DataFrames keyed by sheet name, so the workbook
# is opened and parsed once instead of once per sheet.
tweet_sheets = pd.read_excel(file_path, sheet_name=["Obama", "Romney"])

# Obama tweets
obama_df = tweet_sheets["Obama"]

# Romney tweets
romney_df = tweet_sheets["Romney"]

In [5]:
# Preview the Obama data: a header line, the first rows, then the row count
print("Obama Tweets:", obama_df.head(), sep="\n")
print("No of obama tweets : ", obama_df.shape[0])

Obama Tweets:
   Unnamed: 0                 date            time  \
0         NaN  2012-10-16 00:00:00  10:28:53-05:00   
1         NaN  2016-12-10 00:00:00  10:09:00-05:00   
2         NaN  2012-10-16 00:00:00  10:04:30-05:00   
3         NaN  2012-10-16 00:00:00  10:00:36-05:00   
4         NaN  2012-10-16 00:00:00  09:50:08-05:00   

                                     Anootated tweet Class  Your class label  
0  Kirkpatrick, who wore a baseball cap embroider...     0               NaN  
1  Question: If <e>Romney</e> and <e>Obama</e> ha...     2               NaN  
2  #<e>obama</e> debates that Cracker Ass Cracker...     1               NaN  
3  RT @davewiner Slate: Blame <e>Obama</e> for fo...     2               NaN  
4  @Hollivan @hereistheanswer  Youre missing the ...     0               NaN  
No of obama tweets :  7198


In [6]:
# Preview the Romney data: a header line, the first rows, then the row count
print("\nRomney Tweets:", romney_df.head(), sep="\n")
print("No of Romaney tweets : ", romney_df.shape[0])


Romney Tweets:
   Unnamed: 0                 date            time  \
0         NaN  2012-10-16 00:00:00  09:38:08-05:00   
1         NaN  2012-10-16 00:00:00  10:22:34-05:00   
2         NaN  2012-10-16 00:00:00  10:14:18-05:00   
3         NaN  2012-10-16 00:00:00  09:27:16-05:00   
4         NaN  2012-10-16 00:00:00  10:11:43-05:00   

                                     Anootated tweet Class  Your class label  
0  Insidious!<e>Mitt Romney</e>'s Bain Helped Phi...    -1               NaN  
1  Senior <e>Romney</e> Advisor Claims <e>Obama</...     2               NaN  
2  .@WardBrenda @shortwave8669 @allanbourdius you...    -1               NaN  
3  <e>Mitt Romney</e> still doesn't <a>believe</a...    -1               NaN  
4  <e>Romney</e>'s <a>tax plan</a> deserves a 2nd...    -1               NaN  
No of Romaney tweets :  7200


In [7]:
# Fetch the NLTK data packages this notebook relies on. Packages that are
# already installed are simply reported as up-to-date.
# NOTE(review): presumably these back stopwords.words(), word_tokenize() and
# WordNetLemmatizer used further down — confirm against the installed NLTK version.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dheerajkumarbuchala/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/dheerajkumarbuchala/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/dheerajkumarbuchala/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [8]:
# Class labels to keep; rows carrying any other label (the Obama and Romney
# previews both show a 2, for instance) are dropped by the isin() filters.
# NOTE(review): presumably -1/0/1 mean negative/neutral/positive — confirm.
valid_classes = [-1, 0, 1]

In [9]:
# Restrict the Obama frame to rows whose Class is one of valid_classes
obama_df = obama_df.loc[lambda frame: frame['Class'].isin(valid_classes)]
print("No of obama tweets after cleaning : ", obama_df.shape[0])

No of obama tweets after cleaning :  5471


In [10]:
# Restrict the Romney frame to rows whose Class is one of valid_classes
romney_df = romney_df.loc[lambda frame: frame['Class'].isin(valid_classes)]
print("No of romney tweets after cleaning : ", romney_df.shape[0])

No of romney tweets after cleaning :  5648


In [11]:
# Pull the tweet-text and label columns out of each filtered frame
obama_anootated_tweets, obama_assigned_class = obama_df['Anootated tweet'], obama_df['Class']
romney_anootated_tweets, romney_assigned_class = romney_df['Anootated tweet'], romney_df['Class']

In [12]:
# Convert the tweet-text and label columns to plain Python lists.
# The lists are read straight from the DataFrames instead of calling .tolist()
# on the very names being reassigned, so this cell can be re-run safely
# (a list has no .tolist(), which made a second run raise AttributeError).
obama_anootated_tweets = obama_df['Anootated tweet'].tolist()
obama_assigned_class = obama_df['Class'].tolist()

romney_anootated_tweets = romney_df['Anootated tweet'].tolist()
romney_assigned_class = romney_df['Class'].tolist()

In [18]:
def dataClean(tweets):
    """Strip markup and noise from raw tweets, leaving only letters and spaces.

    For every tweet the following is removed: non-ASCII characters,
    annotation/HTML-style tags such as ``<e>``/``</a>``, http(s) URLs,
    a standalone ``RT`` marker, ``@mentions``, digits, and finally any
    remaining character that is neither an ASCII letter nor whitespace.
    Runs of whitespace are collapsed to a single space and the result is
    trimmed.

    Args:
        tweets: iterable of tweets; each item is passed through ``str()``
            first, so non-string values are cleaned as their string form.

    Returns:
        list of str: one cleaned string per input item, in the same order.
    """
    # Compiled once, outside the loop. Raw strings keep the regex escapes
    # (\s, \d) from being treated as invalid Python string escapes.
    noise = re.compile(r'(</?[a-zA-Z]+>|https?://[^\s]*|(^|\s)RT(\s|$)|@[^\s]+|\d+)')
    non_letters = re.compile(r'[^\sa-zA-Z]+')
    whitespace_run = re.compile(r'\s+')

    cleaned_tweets = []
    for tweet in tweets:
        # Drop non-ASCII characters, then decode back to text. Calling str()
        # on the encoded bytes would yield the "b'...'" repr instead: its
        # prefix hides a leading "RT" from the ^ anchor, and escapes such as
        # "\n" would survive as stray letters once punctuation is removed.
        text = str(tweet).encode('ascii', 'ignore').decode('ascii')
        text = noise.sub(' ', text)
        text = non_letters.sub('', text)
        text = whitespace_run.sub(' ', text).strip()
        cleaned_tweets.append(text)
    return cleaned_tweets

In [19]:
# Run the regex-based cleaner over both tweet lists
obama_cleaned_tweets, romney_cleaned_tweets = map(
    dataClean, (obama_anootated_tweets, romney_anootated_tweets)
)

In [20]:
# Show only a small sample; printing the whole list floods the notebook output
print(obama_cleaned_tweets[:5])



In [21]:
# Show only a small sample; printing the whole list floods the notebook output
print(romney_cleaned_tweets[:5])



In [22]:
def tokenizationAndLemmatization(tweets):
    """Lower-case, tokenize, remove English stopwords and lemmatize each tweet.

    Args:
        tweets: iterable of strings (e.g. the output of ``dataClean``).

    Returns:
        list of str: one entry per input tweet, in order, holding the
        remaining lemmatized tokens joined by single spaces.
    """
    # Loop-invariant setup: build the stopword set and the lemmatizer once
    # instead of reloading the stopword list and constructing a new
    # lemmatizer for every tweet.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    cleaned_tweets = []
    for tweet in tweets:
        tokens = word_tokenize(tweet.lower())
        # Filter stopwords first, then lemmatize what is left
        lemmas = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
        cleaned_tweets.append(' '.join(lemmas))
    return cleaned_tweets

In [24]:
# Tokenize / lemmatize both cleaned lists, overwriting them in place.
# Note: re-running this cell feeds the already-processed text back in.
obama_cleaned_tweets, romney_cleaned_tweets = map(
    tokenizationAndLemmatization, (obama_cleaned_tweets, romney_cleaned_tweets)
)

In [25]:
# Show only a small sample; printing the whole list floods the notebook output
print(obama_cleaned_tweets[:5])

