In [None]:
## Author: Dejun Xiang
## ID: 349329
## Project: Donald Trump analytics
## Supervisor: Prof. Richard O. Sinnott
## Pre-process tweet (English)

In [None]:
import re
import random
import nltk
from textblob import TextBlob
import numpy as np
import pandas as pd
import matplotlib as plt
from nltk.corpus import stopwords

In [None]:
# Path of the raw tweet CSV (machine-specific absolute location)
RAW_DATA_CSV = r"C:\Users\Derek\Desktop\twitter\data\D-noDulplicate.csv"
# Load the raw tweets into a DataFrame
raw_data = pd.read_csv(RAW_DATA_CSV)

In [None]:
# Download the NLTK corpus this notebook relies on (the English stopword list).
# A bare nltk.download() opens the interactive downloader and stalls
# "Restart & Run All"; naming the resource keeps the cell non-interactive.
nltk.download('stopwords')

In [None]:
# Parse the 'dates' column from object (string) values into datetime64
raw_data = raw_data.assign(dates=pd.to_datetime(raw_data['dates']))

In [None]:
# Inspect the column dtypes to confirm that 'dates' was converted to datetime64
raw_data.dtypes

In [None]:
# Count the rows whose 'likes' value is missing
raw_data['likes'].isna().sum()

In [None]:
# Draw a 500-row sample of the raw data and save it to disk.
# Seeding the RNG first makes the sample reproducible, so a fresh
# "Restart & Run All" writes the same rows to the CSV every time.
random.seed(42)
sampleD = sampler(500,raw_data)
sampleD.to_csv(r"C:\Users\Derek\Desktop\twitter\sampleD.csv")

In [None]:
# Only a few rows have no 'likes' value, so drop them and renumber the
# index 0..n-1 (later cells look rows up by those integer labels).
raw_data_notnull = raw_data.dropna(subset=['likes']).reset_index(drop=True)

# Convert the like counts from strings such as "1.2K" to integers.
# Mapping the whole column, instead of assigning cell by cell through .loc,
# is far faster and gives the column a real integer dtype; element-wise
# assignment would leave it as object.
raw_data_notnull['likes'] = raw_data_notnull['likes'].map(str_to_int).astype('int64')

In [None]:
# Preview the first three rows after the 'likes' conversion
raw_data_notnull.head(3)

In [None]:
# Remove reply mentions (e.g. @DonaldTrump) from the tweets;
# rm_reply_symbol rewrites the 'tweet' column of raw_data_notnull in place
rm_reply_symbol(raw_data_notnull)

In [None]:
# Keep a snapshot of the tweets as they were before cleaning
raw_data_notnull_copy = raw_data_notnull.copy()
# Reduce every tweet to its lower-cased, stopword-free words.
# The whole column is assigned at once: the previous chained write
# (`raw_data_notnull.tweet.loc[i] = ...`) can land on a temporary object
# instead of the frame, leaving the tweets silently uncleaned.
raw_data_notnull['tweet'] = raw_data_notnull_copy['tweet'].map(clean_tweet)

In [None]:
# Preview the first three rows after the tweets were cleaned
raw_data_notnull.head(3)

In [None]:
# Read the tweets from a snapshot of the cleaned frame
data_copy1 = raw_data_notnull.copy()
# Label every tweet by its TextBlob polarity:
# 1 = positive (> 0.1), 0 = neutral (within [-0.1, 0.1]), -1 = negative
pol_class = []
for i in range(len(raw_data_notnull)):
    tweet = data_copy1.loc[i,'tweet']
    tb = TextBlob(tweet)
    p = tb.sentiment.polarity
    if p > 0.1:
        result = 1
    else:
        result = 0 if -0.1 <= p <= 0.1 else -1
    pol_class.append(result)
    # progress indicator: report every 1000th row
    if i % 1000 == 0:
        print(i)

In [None]:
# Attach the per-tweet sentiment labels (1 / 0 / -1) as a new 'sentiment' column
raw_data_notnull['sentiment'] = np.array(pol_class)

In [None]:
# Average the numeric columns for each value of 'dates'.
# numeric_only=True keeps the text 'tweet' column out of the aggregation;
# without it recent pandas versions raise a TypeError on string columns.
raw_data_notnull.groupby('dates').mean(numeric_only=True).head()

In [None]:
# Draw the graph of sentiments VS number of tweets post per day
%matplotlib notebook
d2 = raw_data_notnull.groupby('dates').sentiment.mean()
# d2.plot()
myplot = d2.plot(kind='line')
myplot.set_xlabel('Month in 2018')
myplot.set_ylabel('Average twitter posted/day')
myplot.set_title('Twitter Sentiment vary through months for English-Users')

In [None]:
'''
Function: convert string number to int (eg. 1.2K --> 1200, 45-->45)
Input: numbers(string)
Output: int number
'''
def str_to_int(s):
    '''
    Convert an abbreviated count string to an integer.

    Examples: "45" -> 45, "1.2K" -> 1200, "12.34K" -> 12340,
    "1,234" -> 1234, "3M" -> 3000000.

    Input:  s -- the count as text (surrounding whitespace and thousands
            separators are ignored); a K/k suffix means thousands and an
            M/m suffix means millions
    Output: int number
    Raises: ValueError when the text is not a recognisable count
    '''
    multipliers = {'K': 1000, 'M': 1000000}
    text = str(s).strip().replace(',', '')
    # number (with optional decimals) followed by a magnitude suffix
    match = re.fullmatch(r'(\d*\.?\d+)\s*([KkMm])', text)
    if match:
        number, suffix = match.groups()
        # round() instead of int() so binary float error such as
        # 2299.9999999999995 cannot truncate to the wrong integer
        return int(round(float(number) * multipliers[suffix.upper()]))
    return int(text)

In [None]:
'''
Function: remove non-letter symbols and stopwords; tokenize the tweet
Input:    raw tweet (string)
Output:   cleaned tweet (string of the remaining words joined by spaces)
'''
def clean_tweet(tweet, stops=None):
    '''
    Reduce a raw tweet to its lower-cased words with stopwords removed.

    Input:  tweet -- raw tweet (string)
            stops -- optional collection of words to drop; when omitted the
                     NLTK English stopword list is used. Passing a prebuilt
                     set avoids reloading the list for every tweet.
    Output: cleaned tweet -- the remaining words joined by single spaces
    '''
    if stops is None:
        # build the set once per call; set membership is O(1) per word
        stops = set(stopwords.words("english"))
    # replace every character that is not an English letter with a space
    letters_only = re.sub("[^a-zA-Z]"," ",tweet)
    # lower-case, then split on whitespace into separate words
    words = letters_only.lower().split()
    # drop the stopwords and re-join the useful words
    return " ".join(w for w in words if w not in stops)


'''
Function: remove reply mentions (e.g. @DonaldTrump) from the 'tweet' column
Input:    raw dataset (dataframe)
Output:   none -- the dataset passed in is modified in place
'''
def rm_reply_symbol(dataset):
    '''
    Strip @-mentions (e.g. "@DonaldTrump ") from the 'tweet' column.

    Input:  dataset -- dataframe with a 'tweet' column of strings
    Output: none; the 'tweet' column of `dataset` is rewritten in place

    A mention is '@', any run of non-space characters, and the delimiter
    that ends it (whitespace or one of the listed punctuation marks); the
    delimiter is removed together with the mention.
    '''
    # Raw string so the regex escapes are not read as (invalid) string
    # escapes. The `|$` alternative also removes a mention that is the last
    # thing in the tweet and therefore has no trailing delimiter.
    pattern = re.compile(r'@\S*(?:[\s.,/#!$%^&*;:{}=\-_`~()]|$)')
    # Column-wise replacement works for any index (no reliance on the labels
    # being 0..n-1) and does not need a copy of the whole frame.
    dataset["tweet"] = dataset["tweet"].str.replace(pattern, '', regex=True)

In [None]:
'''
Function: classify each tweet as positive (1), neutral (0) or negative (-1)
          and return the list of results
Input:    raw dataset (dataframe)
Output:   result of classification (list)
'''
def get_sentiments(dataset, threshold=0.1, progress_every=10):
    '''
    Classify every tweet by its TextBlob polarity.

    Input:  dataset        -- dataframe with a 'tweet' column of strings
            threshold      -- polarity above +threshold is positive, below
                              -threshold is negative, anything between
                              (inclusive) is neutral
            progress_every -- print the row counter every this many rows;
                              0 or None disables the progress output
    Output: list with one label per row, in row order:
            1 positive, 0 neutral, -1 negative
    '''
    pol_class = []
    # Iterate over the column values directly: this needs no copy of the
    # frame and does not assume the index labels are 0..n-1.
    for i, tweet in enumerate(dataset['tweet']):
        p = TextBlob(tweet).sentiment.polarity
        if p > threshold:
            result = 1
        elif p >= -threshold:
            result = 0
        else:
            result = -1
        pol_class.append(result)
        if progress_every and i % progress_every == 0:
            print(i)

    return pol_class

In [None]:
'''
Function: reservoir sampler
          samples rows randomly and uniformly (this can be proved mathematically)
          --> find the random row positions first, then get the rows
Input: k (int) --> the number of samples needed
       dataset (dataframe) --> the dataframe sampled from
Output: sampled dataframe
'''
def sampler(k,dataset,seed=None):
    '''
    Reservoir-sample k rows from a dataframe.

    Input:  k       -- number of rows wanted (1 <= k <= len(dataset))
            dataset -- dataframe to sample from
            seed    -- optional seed for a private random generator, so the
                       same sample can be reproduced; when omitted the
                       module-level `random` state is used
    Output: dataframe with the k selected rows, or the message string
            "sampling size 'k' is not valid, try other 'k'" when k is out
            of range
    '''
    num_rows = len(dataset)
    if k > num_rows or k <= 0:
        return "sampling size 'k' is not valid, try other 'k'"
    # a dedicated generator keeps a seeded run from disturbing global state
    rng = random if seed is None else random.Random(seed)
    # the reservoir starts with the first k row positions ...
    index = list(range(k))
    # ... and row i then replaces a random slot with probability k/(i+1)
    for i in range(k,num_rows):
        random_index = rng.randint(0,i)
        if random_index < k:
            index[random_index] = i
    return dataset.iloc[index]