In [1]:
import csv
import re, nltk
import time
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.externals import joblib



In [2]:
def normalizer(review):
    """Convert one raw tweet into a list of lower-cased, lemmatized tokens.

    Pipeline: strip HTML markup/entities with BeautifulSoup, blank out
    @mentions, URLs and every character that is not an ASCII letter, space or
    tab, tokenize with NLTK, keep tokens longer than two characters, lower-case
    them, drop English stop words and lemmatize with WordNet.

    Parameters
    ----------
    review : str
        Raw tweet text. ``None`` and float NaN (what pandas produces for an
        empty CSV cell) are treated as an empty tweet.

    Returns
    -------
    list of str
        Lemmas in their original order; empty when nothing survives cleaning
        or when the input is missing.
    """
    # A missing cell would otherwise blow up inside BeautifulSoup; returning
    # an empty token list lets callers filter such rows out by length.
    if review is None or (isinstance(review, float) and review != review):
        return []

    # Removing HTML encoding such as '&amp', '&quot'
    souped = BeautifulSoup(review, 'lxml').get_text()

    # Raw string so that \w, \/ and \S reach the regex engine without
    # triggering Python's invalid-escape-sequence warning.
    # Alternatives: @mentions | any non-letter character | URLs.
    # The '#' of a hashtag is blanked, but the tag word itself is kept.
    only_words = re.sub(
        r"(@[A-Za-z0-9]+)|([^A-Za-z \t])|(\w+:\/\/\S+)", " ", souped
    )

    tokens = nltk.word_tokenize(only_words)
    # Drop very short tokens (length <= 2), then normalise case.
    lower_case = [token.lower() for token in tokens if len(token) > 2]

    stop_words = set(stopwords.words('english'))
    filtered_result = [token for token in lower_case if token not in stop_words]

    wordnet_lemmatizer = WordNetLemmatizer()
    return [wordnet_lemmatizer.lemmatize(token) for token in filtered_result]

In [3]:
def main(model_path='svc.sav', vocabulary_path='vocabulary_SVC.csv',
         tweets_path='test.csv', cleaned_path='cleaned_tweet.csv',
         predictions_path='predicted_sentiment.csv'):
    """Predict a label for every tweet in a CSV file using a saved model.

    Loads a persisted classifier and its vocabulary, normalizes the ``tweet``
    column of the input file with ``normalizer``, writes the cleaned tweets to
    ``cleaned_path``, vectorizes them with TF-IDF over the saved vocabulary,
    and writes the remaining columns plus a ``label`` column to
    ``predictions_path``.

    Parameters
    ----------
    model_path : str
        File loaded with ``joblib.load``; must yield an object with a
        ``predict`` method.
    vocabulary_path : str
        Header-less CSV whose first column lists the vocabulary terms; the row
        position of a term is used as its feature index.
    tweets_path : str
        Input CSV (read as ISO-8859-1) containing a ``tweet`` column.
    cleaned_path : str
        Where the normalized tweets are written (UTF-8).
    predictions_path : str
        Where the predictions are written (UTF-8).

    Returns
    -------
    None
    """
    #### Loading the saved model
    # NOTE: joblib.load unpickles the file, which can execute arbitrary code;
    # only load model files from a trusted source.
    model = joblib.load(model_path)
    # dtype=str / keep_default_na=False: read every term literally, so that
    # vocabulary entries such as "nan" or "null" are not turned into float NaN.
    vocabulary_model = pd.read_csv(
        vocabulary_path, header=None, dtype=str, keep_default_na=False
    )
    vocabulary_model_dict = {
        word: i for i, word in enumerate(vocabulary_model[0])
    }
    # min_df is not passed: scikit-learn ignores it when a fixed vocabulary
    # is supplied, so it had no effect here.
    tfidf = TfidfVectorizer(
        sublinear_tf=True,
        vocabulary=vocabulary_model_dict,
        norm='l2',
        ngram_range=(1, 3),
    )

    #### Reading retrieved tweets as dataframe
    tweet_df = pd.read_csv(tweets_path, encoding="ISO-8859-1")
    # None (not the deprecated -1) means "do not truncate cell contents".
    pd.set_option('display.max_colwidth', None)

    #### Normalizing retrieved tweets
    tweet_df['normalized_tweet'] = tweet_df.tweet.apply(normalizer)
    # Removing rows with normalized tweets of length 0
    tweet_df = tweet_df[tweet_df['normalized_tweet'].map(len) > 0]
    print("Number of tweets remaining after cleaning: ", tweet_df.normalized_tweet.shape[0])
    print(tweet_df[['tweet','normalized_tweet']].head())

    #### Saving cleaned tweets to csv file
    # Non-inplace drop: tweet_df is a filtered slice, and mutating it in place
    # can trigger pandas' SettingWithCopyWarning.
    tweet_df.drop(columns=['tweet']).to_csv(
        cleaned_path, encoding='utf-8', index=False
    )
    # Reading the file back turns each token list into its string form, which
    # is what the vectorizer tokenizes. Use the same encoding it was written in.
    cleaned_tweet = pd.read_csv(cleaned_path, encoding='utf-8')

    # NOTE(review): fit_transform re-estimates the IDF weights from these
    # tweets rather than reusing the weights seen at training time, because
    # only the vocabulary was persisted. Consider saving and loading the
    # fitted vectorizer instead - confirm against the training notebook.
    cleaned_tweet_tfidf = tfidf.fit_transform(cleaned_tweet['normalized_tweet'])
    targets_pred = model.predict(cleaned_tweet_tfidf)

    #### Saving predicted sentiment of tweets to csv
    # Flatten to 1-D so the column assignment does not depend on how pandas
    # handles 2-D values for a single column.
    cleaned_tweet['label'] = targets_pred.reshape(-1)
    cleaned_tweet.drop(columns=['normalized_tweet']).to_csv(
        predictions_path, encoding='utf-8', index=False
    )

In [4]:
# One-time setup: normalizer() reads the NLTK English stop-word list and the
# WordNet corpus. On a fresh environment, uncomment and run the lines below once.
# (nltk.word_tokenize presumably also needs the 'punkt' tokenizer models -
# verify for your NLTK version.)
#import nltk
#nltk.download('stopwords')
#nltk.download('wordnet')
# Run the prediction pipeline when this cell/script is executed directly.
if __name__ == "__main__":
    main()

  if sys.path[0] == '':


Number of tweets remaining after cleaning:  1953
                                                                                                                               tweet  \
0  I hate the new #iphone upgrade. Won't let me download apps. #ugh #apple sucks                                                       
1  currently shitting my fucking pants. #apple #iMac #cashmoney #raddest #swagswagswag http://instagr.am/p/UUIS0bIBZo/                 
2  I'd like to puts some CD-ROMS on my iPad, is that possible?' â Yes, but wouldn't that block the screen?\n                         
3  My ipod is officially dead. I lost all my pictures and videos from the 1D and 5sos concert,and from Vet Camp #hatinglife #sobbing   
4  Been fighting iTunes all night! I only want the music I $&@*# paid for                                                              

                                                                              normalized_tweet  
0  [hate, new, iphone, upgrade, let, 

