In [1]:
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
helper_path = "../code/"
sys.path.insert(0, helper_path)

In [3]:
from New_MissingValue import MissingValue

In [4]:
# relative path
data_path = "../data/"
data_file_list = ["train.csv","test.csv"]

In [5]:
# load data
data_train = pd.read_csv(data_path + data_file_list[0])
data_test = pd.read_csv(data_path + data_file_list[1])

# data info
print(f'Training text file: {data_train.shape[0]} rows; {data_train.shape[1]} columns')
print(f'Testing text file: {data_test.shape[0]} rows; {data_test.shape[1]} columns')
data_train.head()

Training text file: 27486 rows; 4 columns
Testing text file: 3535 rows; 3 columns


Unnamed: 0,textID,text,selected_text,sentiment
0,a3d0a7d5ad,Spent the entire morning in a meeting w/ a ven...,my boss was not happy w/ them. Lots of fun.,neutral
1,251b6a6766,Oh! Good idea about putting them on ice cream,Good,positive
2,c9e8d1ef1c,says good (or should i say bad?) afternoon! h...,says good (or should i say bad?) afternoon!,neutral
3,f14f087215,i dont think you can vote anymore! i tried,i dont think you can vote anymore!,negative
4,bf7473b12d,haha better drunken tweeting you mean?,better,positive


In [6]:
# check if NaNs exist in training data, and acquire their locations
missing_value_finder_train = MissingValue(data_train)
missing_value_finder_train.missing_value_summary(verbose=True)
missing_value_finder_train.missing_value_enumerator()

In column[91m text[00m , we have[91m 1[00m missing values.
In column[91m selected_text[00m , we have[91m 1[00m missing values.
textID           fdb77c3752
text                    NaN
selected_text           NaN
sentiment           neutral
Name: 13133, dtype: object


[13133]

In [7]:
# drop this missing row
for obj in missing_value_finder_train.na_index:
    data_train = data_train.drop([obj])

In [8]:
# check again
missing_value_finder_train = MissingValue(data_train)
missing_value_finder_train.missing_value_summary(verbose=True)

No missing value found!


({},
 0,
 textID           0
 text             0
 selected_text    0
 sentiment        0
 dtype: int64)

In [9]:
# check if NaNs exist in test data, and acquire their locations
missing_value_finder_test = MissingValue(data_test)
missing_value_finder_test.missing_value_summary(verbose=True)

No missing value found!


({},
 0,
 textID       0
 text         0
 sentiment    0
 dtype: int64)

Note:
- Missing value check & elimination finished 

# Data Classification

- Classify the data by their features, and save them into separate csv files, named "Positive.csv", "Negative.csv" and "Neutral.csv"

In [10]:
def SentiSeparation(data):
    Positive=data[data['sentiment']=='positive']
    Negative=data[data['sentiment']=='negative']
    Neutral=data[data['sentiment']=='neutral']
    Positive.to_csv(r'../data/Positive.csv')
    Negative.to_csv(r'../data/Negative.csv')
    Neutral.to_csv(r'../data/Neutral.csv')

In [11]:
SentiSeparation(data_train)

# Data Cleaning

Since the data set is too large, running through all the data may take too long. Therefore, only data with feature "positive" is tested at this time.

In [12]:
process_path = "../code/"
sys.path.insert(0, process_path)

In [13]:
from TextPreprocessing import TweetPreprocess
import nltk

In [14]:
Positive=pd.read_csv("../data/Positive.csv")
Positive=pd.DataFrame(Positive)
Positive.head()

Unnamed: 0.1,Unnamed: 0,textID,text,selected_text,sentiment
0,1,251b6a6766,Oh! Good idea about putting them on ice cream,Good,positive
1,4,bf7473b12d,haha better drunken tweeting you mean?,better,positive
2,6,2ab82634d5,had an awsome salad! I recommend getting the S...,had an awsome salad!,positive
3,7,a5a1c996c0,fine! Going to do my big walk today 20 or so ...,fine!,positive
4,8,a182b2638e,Thank a yoou how are you? #TwitterTakeover,Thank,positive


In [16]:
def process_to_csv(process_df, feature, clean_csv_path):
    """ Ensemble method for processing multiple tweets in dataframe (df)
    Params:
        process_df: the df to be processed
        feature: feature (colunm) of the df, use 'text' or 'selected_text'
        clean_csv_path: directory of written out csv file
    Return:
        saved_csv: csv file save to clean_csv_path
    """
    # copy the processed df from original df 
    processed_df = process_df.copy()
    processed_df_tbused=process_df.copy()
    
    for i, tweet in enumerate(process_df[feature]):
        if type(tweet) == str:
            processer = TweetPreprocess(tweet)
        else:
            raise Exception('The tweet must be str!')
            
        # call the processer class 
        clean_tweet_both = processer.process_tweet()
        clean_tweet=clean_tweet_both[0]
        clean_word=clean_tweet_both[1]
        processed_df[feature][i] = clean_tweet
        processed_df_tbused[feature][i]=clean_word
        
    # save as a csv file
    
    processed_df.to_csv(clean_csv_path)
    return processed_df_tbused


In [18]:
# saving path of the cleaned data
save='../data/Positive_clean_data.csv'
# featured data to be cleaned
feature='selected_text'
# apply the previous function
selected_text=process_to_csv(Positive,feature,save)
selected_text.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0.1,Unnamed: 0,textID,text,selected_text,sentiment
0,1,251b6a6766,Oh! Good idea about putting them on ice cream,[good],positive
1,4,bf7473b12d,haha better drunken tweeting you mean?,[better],positive
2,6,2ab82634d5,had an awsome salad! I recommend getting the S...,"[had, an, awsome, salad]",positive
3,7,a5a1c996c0,fine! Going to do my big walk today 20 or so ...,[fine],positive
4,8,a182b2638e,Thank a yoou how are you? #TwitterTakeover,[thank],positive


# Preparation for Statistical Analysis -- Unigram/Multi-gram

Cleaned data are used to do statistical analysis. Due to the large size of the data to be processed, only first 50 sets of data are tested to verify the code.

In [19]:
split_selected_text=selected_text['selected_text'][:50]
split_selected_word=[]
for i in range(len(split_selected_text)):
    for j in split_selected_text[i]:
        split_selected_word.append(j)

In [35]:
def nGram(split_selected_word,split_selected_text,N):
    if N==1:
        print('Unigram text framework')
        unigram=list(set(split_selected_word))
        unigram_frequency=[]
        unigram_presence=np.zeros(len(unigram))
        for o in range(len(unigram)):
            unigram_frequency.append(split_selected_word.count(unigram[o]))
            for l in range(len(split_selected_text)):
                if unigram[o] in split_selected_text[l]:
                    unigram_presence[o]+=1
        gram=unigram
        presence=unigram_presence
        frequency=unigram_frequency
    else:
        print(N,"-gram text frame work")
        ngram=[]
        for k in range(len(split_selected_text)):
            for m in range(0,len(split_selected_text[k])-(N-1)):
                string=split_selected_text[k][m]
                for g in range(1,N):
                    string=string+" "+split_selected_text[k][m+g]
                ngram.append(str(string))
        gram=list(set(ngram))
        ngram_frequency=[]
        ngram_presence='not applicable'
        for o in range(len(gram)):
            ngram_frequency.append(ngram.count(gram[o]))
        presence=ngram_presence
        frequency=ngram_frequency
    number_unique_term=len(gram)
    return gram,presence,frequency,number_unique_term
#for lis in range(len(split_selected_text)):
#    for ind in range(len(lis)):
#        if split_selected_text[lis][ind]

check if unigram works, N=1

In [42]:
uni_gram,uni_presence,uni_frequency,uni_number_unique_term=nGram(split_selected_word,split_selected_text,1)

Unigram text framework


In [32]:
comparison={'presence':uni_presence,'frequency':uni_frequency}
pd.DataFrame(comparison,index=uni_gram)

Unnamed: 0,presence,frequency
laptop,1.0,1
has,1.0,1
everyone,1.0,1
best,1.0,1
it,2.0,2
...,...,...
should,1.0,1
her,1.0,1
hope,1.0,1
that,2.0,2


check if multi-gram works, N=2

In [47]:
bi_gram,bi_presence,bi_frequency,bi_number_unique_term=nGram(split_selected_word,split_selected_text,2)

2 -gram text frame work


In [51]:
comparison={'frequency':bi_frequency}
pd.DataFrame(comparison,index=bi_gram)

Unnamed: 0,frequency
happy birthday,1
tuner for,1
be most,1
most welcome,1
day everyone,1
...,...
see them,1
of mine,1
got to,1
happy de,1


check if multi-gram works, N=4

In [52]:
tert_gram,tert_presence,tert_frequency,tert_number_unique_term=nGram(split_selected_word,split_selected_text,4)

4 -gram text frame work


In [53]:
comparison={'frequency':tert_frequency}
pd.DataFrame(comparison,index=tert_gram)

Unnamed: 0,frequency
d happy mother s,1
happy birthday little sister,1
tuner for my laptop,1
bought a tv tuner,1
birthday little sister of,1
...,...
so good glad you,1
sweet dreams olive juice,1
hope you have a,1
he he i deserve,1


# Numerify the Text Output of 'Sentiment'

In [56]:
sentiment=pd.get_dummies(data_train['sentiment'])
print(sentiment)

       negative  neutral  positive
0             0        1         0
1             0        0         1
2             0        1         0
3             1        0         0
4             0        0         1
...         ...      ...       ...
27481         0        1         0
27482         0        1         0
27483         0        1         0
27484         0        0         1
27485         0        0         1

[27485 rows x 3 columns]


Assigning sentiment information with values.

In [None]:
from nltk import FreqDist
import pickle
import sys
import utils

In [None]:
from collections import Counter