In [1]:
import pandas as pd
import numpy as np
import re
import pickle
import glob

from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords as stopwords
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer

from ast import literal_eval
from nltk import classify
from nltk import NaiveBayesClassifier
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.metrics.scores import (precision, recall, f_measure)
from sklearn.naive_bayes import MultinomialNB,BernoulliNB
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.svm import SVC
import emoji

# Full Pandas pipeline that can be used on Search/Stream Twitter API data

In [2]:
# Load our best-performing classifier (the file name suggests logistic regression).
# NOTE: pickle.load can execute arbitrary code -- only unpickle files you created or trust.
# NOTE(review): this absolute, machine-specific path will not resolve on another machine;
# consider a configurable model directory.
with open(r'C:\Users\AdamShafi\Twitter Sentiment Analysis\Pickled Algos\LogReg_clf.sav', "rb") as clf_file:
    # The context manager closes the handle even if unpickling fails.
    clf = pickle.load(clf_file)

In [3]:
# Read the pickled lexicon into `all_words`; all_tokens_for_model iterates over it
# to build one feature entry per lexicon word.
with open('all_words.pkl', mode='rb') as lexicon_file:
    all_words = pickle.load(lexicon_file)

In [17]:
def preprocess_text(df):
    """Keep only non-retweet tweets whose text contains a dollar sign.

    Steps, in order:
      1. drop rows whose ``text`` is missing;
      2. keep rows whose text contains a literal ``$`` (cashtags or money);
      3. un-escape the HTML entity ``&amp;`` (any letter case) to ``&``;
      4. drop rows whose text starts with ``RT``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``text`` column of strings (missing values allowed).

    Returns
    -------
    pandas.DataFrame
        A new, independent frame; ``df`` itself is not modified. Surviving
        rows keep their original index labels.
    """
    with_text = df.dropna(subset=['text'])
    # regex=False matches '$' literally. The previous pattern '\$' relied on an
    # invalid escape sequence in a non-raw string literal.
    with_dollar = with_text[with_text['text'].str.contains('$', regex=False)]
    # .assign builds a new frame instead of writing a column into a filtered
    # slice, which is what triggered SettingWithCopyWarning before.
    unescaped = with_dollar.assign(
        text=with_dollar['text'].str.replace('&amp;', '&', case=False, regex=True)
    )
    # NOTE(review): '^RT' also matches ordinary tweets that merely start with the
    # letters "RT" (e.g. "RTX ..."); confirm whether r'^RT\b' was intended.
    is_retweet = unescaped['text'].str.contains('^RT', regex=True)  # ~ inverts the mask below
    # .copy() lets later pipeline steps add columns without chained-assignment warnings.
    return unescaped[~is_retweet].copy()

def extract_emojis(df):
    """Add an ``emoji`` column holding the emoji characters found in each tweet.

    The check is done character by character, per row. (The previous version
    iterated over whole tweets instead of characters and wrote one and the same
    joined string into every row.)

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``text`` column. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the ``emoji`` column added. A tweet with
        no emoji gets an empty string.
    """
    # emoji>=2.0 exposes EMOJI_DATA; older releases expose UNICODE_EMOJI, which is
    # either a flat {emoji: name} dict or (1.x) a dict nested by language code.
    emoji_table = getattr(emoji, 'EMOJI_DATA', None)
    if emoji_table is None:
        emoji_table = emoji.UNICODE_EMOJI
        emoji_table = emoji_table.get('en', emoji_table)

    def _emojis_in(text):
        # str() keeps non-string cells (e.g. NaN) from raising; they yield ''.
        return ''.join(ch for ch in str(text) if ch in emoji_table)

    df['emoji'] = df['text'].map(_emojis_in)
    return df

def extract_cashtags(df):
    """Add a ``Cashtags`` column listing the cashtags found in each tweet.

    A cashtag is a ``$`` followed by two or more uppercase ASCII letters.
    Each cell of ``text`` is passed through ``str()`` before matching.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``text`` column. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object; ``Cashtags`` holds one list of matches per row.
    """
    cashtag_pattern = re.compile(r'\$[A-Z]{2,}')
    df['Cashtags'] = [cashtag_pattern.findall(str(tweet)) for tweet in df['text']]
    return df

def extract_hashtags(df):
    """Add a ``hashtag_list`` column with the hashtag texts of each tweet.

    Each ``hashtags`` cell is expected to be the string form of a list of
    dicts (as written to CSV); the ``'text'`` value of every dict that has
    one is collected. Missing values (NaN/None) and blank strings yield an
    empty list instead of raising, and cells that are already lists are used
    as they are.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``hashtags`` column. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``hashtag_list`` added.

    Raises
    ------
    ValueError, SyntaxError
        From ``ast.literal_eval`` when a non-blank string cell is not a valid
        Python literal.
    """
    def _hashtag_texts(cell):
        if isinstance(cell, str):
            stripped = cell.strip()
            entities = literal_eval(stripped) if stripped else []
        elif isinstance(cell, (list, tuple)):
            entities = cell
        else:
            # NaN / None: the tweet carried no hashtag data.
            entities = []
        return [entity['text'] for entity in entities if 'text' in entity]

    df['hashtag_list'] = [_hashtag_texts(cell) for cell in df['hashtags']]
    return df

def remove_pattern(input_txt, pattern):
    """Delete every match of the regular expression ``pattern`` from ``input_txt``.

    The substitution is applied directly with the caller's pattern. The
    previous approach re-used each matched string as a new regex with a
    backslash prepended, which (a) turned a match starting with a letter into
    a character class (``d`` became ``\\d``) and (b) let a short match eat the
    prefix of a longer one (removing ``$AA`` mangled ``$AAPL`` into ``PL``).

    Parameters
    ----------
    input_txt : str
        Text to clean.
    pattern : str or compiled pattern
        Regular expression whose matches are removed.

    Returns
    -------
    str
        ``input_txt`` with all matches removed.
    """
    return re.sub(pattern, '', input_txt)

def clean_up(df):
    """Add a ``clean_tweet`` column: the tweet text normalised for modelling.

    Applied in order: remove cashtags (``$`` + 2 or more uppercase letters),
    remove ``http...`` and ``www...`` URLs, lowercase, remove ``@mentions``,
    replace every non-letter with a space, and drop words of three characters
    or fewer.

    Every substitution passes ``regex=True`` explicitly. From pandas 2.0 on,
    ``Series.str.replace`` treats the pattern as a literal string by default,
    which silently disabled the punctuation-removal step.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``text`` column of strings. Modified in place. An empty
        frame is handled (``clean_tweet`` is added with no rows).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``clean_tweet`` added.
    """
    # Cashtags go first: the pattern needs the original uppercase letters.
    cleaned = df['text'].str.replace(r'\$[A-Z]{2,}', '', regex=True)
    cleaned = cleaned.str.replace(r'http\S+', '', regex=True)   # remove urls
    cleaned = cleaned.str.replace(r'www\S+', '', regex=True)
    cleaned = cleaned.str.lower()                                # lowercase tweet
    cleaned = cleaned.str.replace(r'@[^\s]+', '', regex=True)    # remove @name
    cleaned = cleaned.str.replace(r'[^a-zA-Z]', ' ', regex=True) # remove punctuation
    # Keep only words longer than three characters.
    cleaned = cleaned.apply(
        lambda tweet: ' '.join(word for word in tweet.split() if len(word) > 3)
    )
    df['clean_tweet'] = cleaned
    return df

def tokenize(df):
    """Replace each ``clean_tweet`` string with the result of ``word_tokenize``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``clean_tweet`` column of strings. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object; ``clean_tweet`` now holds one token list per row.
    """
    token_lists = df['clean_tweet'].map(word_tokenize)
    df['clean_tweet'] = token_lists
    return df
    
# Lazily-filled cache so the stop-word corpus is read once, not once per tweet.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def remove_stopwords(tokenized_list):
    """Return the tokens of ``tokenized_list`` that are not English stop words.

    Parameters
    ----------
    tokenized_list : iterable of str
        Tokens of one tweet.

    Returns
    -------
    list of str
        The non-stop-word tokens, in their original order (duplicates kept).
    """
    known_stopwords = _english_stopwords()  # set membership is O(1) per token
    return [word for word in tokenized_list if word not in known_stopwords]

def apply_remove_stopwords(df):
    """Run ``remove_stopwords`` on every ``clean_tweet`` token list.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``clean_tweet`` column. Its ``clean_tweet`` column is
        overwritten in place.

    Returns
    -------
    pandas.DataFrame
        The result of dropping rows whose ``clean_tweet`` is missing after
        filtering.
    """
    filtered_tokens = df['clean_tweet'].map(remove_stopwords)
    df['clean_tweet'] = filtered_tokens
    return df.dropna(subset=['clean_tweet'])

def lemmatize_sentence(tokens):
    """Lemmatize a POS-tagged sentence.

    Parameters
    ----------
    tokens : iterable of (word, tag) pairs
        ``tag`` is a string; tags starting with ``NN`` are lemmatized as
        nouns (``'n'``), tags starting with ``VB`` as verbs (``'v'``), and
        everything else with ``'a'``.

    Returns
    -------
    list
        One lemma per input pair, in order.
    """
    wordnet = WordNetLemmatizer()

    def _wordnet_pos(tag):
        # Collapse the tag to the single-letter code passed to lemmatize().
        if tag.startswith('NN'):
            return 'n'
        if tag.startswith('VB'):
            return 'v'
        return 'a'

    return [wordnet.lemmatize(word, _wordnet_pos(tag)) for word, tag in tokens]

def apply_lemmatize_sentence(df):
    """POS-tag, then lemmatize, every ``clean_tweet`` token list.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``clean_tweet`` column of token lists. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object; ``clean_tweet`` holds the output of
        ``lemmatize_sentence`` applied to the ``pos_tag`` result of each row.
    """
    # Two full passes over the column: tag every row first, then lemmatize.
    for transform in (pos_tag, lemmatize_sentence):
        df['clean_tweet'] = df['clean_tweet'].map(transform)
    return df

def all_tokens_for_model(tokens):
    """Build the feature dict for one tweet.

    Parameters
    ----------
    tokens : container of str
        Tokens of one tweet.

    Returns
    -------
    dict
        One entry per word in the module-level ``all_words`` lexicon, in
        lexicon order: ``True`` when the word is in ``tokens``, else ``False``.
    """
    features = {}
    for lexicon_word in all_words:
        features[lexicon_word] = lexicon_word in tokens
    return features

def apply_all_tokens_for_model(df):
    """Replace each ``clean_tweet`` token list with its model feature dict.

    Maps ``all_tokens_for_model`` over the ``clean_tweet`` column directly,
    like the other ``apply_*`` helpers in this notebook. The previous
    row-wise ``DataFrame.apply(axis=1)`` built a full row Series for every
    tweet just to read one field.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``clean_tweet`` column. Modified in place. An empty
        frame is returned with an empty ``clean_tweet`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object; ``clean_tweet`` holds one feature dict per row.
    """
    df['clean_tweet'] = df['clean_tweet'].map(all_tokens_for_model)
    return df

def probability_score(data):
    """Return the classifier's class probabilities for one feature set.

    Parameters
    ----------
    data
        Feature set passed unchanged to ``clf.prob_classify``.

    Returns
    -------
    tuple
        ``(Positive, Neutral, Negative)`` probabilities, in that order.
    """
    distribution = clf.prob_classify(data)
    return tuple(distribution.prob(label) for label in ('Positive', 'Neutral', 'Negative'))
    
def results(df):
    """Classify every tweet and attach the label and class probabilities.

    ``probability_score`` is evaluated once per row and its three values are
    then split into columns. The previous version called it separately for
    each of the three columns, tripling the classifier work.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``clean_tweet`` column of feature sets accepted by
        ``clf``. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the columns ``sentiment``, ``Positive``,
        ``Neutral`` and ``Negative`` added, in that order.
    """
    df['sentiment'] = df['clean_tweet'].map(clf.classify)
    # One (Positive, Neutral, Negative) tuple per row.
    scores = df['clean_tweet'].map(probability_score)
    for position, label in enumerate(('Positive', 'Neutral', 'Negative')):
        # position is bound as a default so each lambda keeps its own index.
        df[label] = scores.map(lambda triple, position=position: triple[position])
    return df
    

# Apply Pipeline

In [14]:
#load multiple csv files into pandas
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
path = r'C:\Users\AdamShafi\Twitter Sentiment Analysis\Tweets Data'
# glob returns files in arbitrary order; sorting makes the row order of `df` reproducible.
all_files = sorted(glob.glob(path + "/*.csv"))
if not all_files:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError(f"No CSV files found in {path!r}")

# One frame per file, first line of each file used as the header.
li = [pd.read_csv(filename, index_col=None, header=0) for filename in all_files]

# Stack all files into one frame with a fresh 0..n-1 index.
df = pd.concat(li, axis=0, ignore_index=True)

In [15]:
df.shape[0]  # number of rows in the combined frame

171485

In [18]:
# Run the cleaning, feature-building and classification steps in order; each
# step receives the frame returned by the previous one.
pipeline_steps = (
    preprocess_text,
    extract_hashtags,
    extract_emojis,
    extract_cashtags,
    clean_up,
    tokenize,
    apply_remove_stopwords,
    apply_lemmatize_sentence,
    apply_all_tokens_for_model,
    results,
)
processed = df
for pipeline_step in pipeline_steps:
    processed = processed.pipe(pipeline_step)
# Rebind `df` only after every step has succeeded.
df = processed

In [19]:
# clean_tweet is very long and doesn't output well in Excel, so leave it out of the export.
df = df.drop(columns=['clean_tweet'])

In [20]:
# Write the scored tweets to results.csv without the index column.
df.to_csv(path_or_buf='results.csv', index=False)