# Preprocessing workflow
## Data 553 Group Project
### Chris Donoff, Wei Wei Liu, Bruno Santos, Alex Tamm

In [26]:
from __future__ import (absolute_import, division,
                        print_function, unicode_literals)

import requests #must be installed for tense functions to work
import nltk
from nltk.corpus import stopwords #for stopword removal
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.parse import CoreNLPParser
import pandas as pd
import numpy as np

pd.__version__

u'0.24.2'

## 1. Load Our Test Data Set

In [27]:
# Load the CSV holding our coded sample of reviews.
filename = "sample_384_all_coded.csv"
# Tab separator used for the SentiStrength exchange files in section 5.
# str() yields the native string type on both Python 2 (where unicode_literals
# would otherwise make the literal a unicode object) and Python 3 (where
# .encode() would produce bytes, which cannot be used as a CSV separator).
delimiter = str("\t")

df = pd.read_csv(filename, encoding="utf-8")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 384 entries, 0 to 383
Data columns (total 16 columns):
id                384 non-null int64
appTitle          384 non-null object
userName          354 non-null object
date              384 non-null object
score             384 non-null int64
text              384 non-null object
fileDate          384 non-null object
fileCategories    384 non-null object
contentRating     384 non-null object
appId             384 non-null object
reviewId          384 non-null int64
processed_text    384 non-null object
label_UE          384 non-null int64
label_BR          384 non-null int64
label_FR          384 non-null int64
label_R           384 non-null int64
dtypes: int64(7), object(9)
memory usage: 48.1+ KB


## 2. Apply Stopword Removal to processed_text field

In [28]:
# English stopword list from NLTK, kept as a list under its original name.
stop = stopwords.words('english')
# Set copy of the list so each per-word membership test is a hash lookup
# instead of a linear scan.
stop_lookup = set(stop)

# Drop stopwords from every review. The text is split on whitespace only and the
# comparison is case-sensitive, so capitalised words (e.g. "It") and words with
# attached punctuation are kept, as the sample output of this cell shows.
df["stopwords_removal"] = df['processed_text'].apply(
    lambda text: ' '.join(word for word in text.split() if word not in stop_lookup)
)

df[["id","processed_text","stopwords_removal"]].head(3)

Unnamed: 0,id,processed_text,stopwords_removal
0,804084,"It's great, got lots of shows, 5 stars","It's great, got lots shows, 5 stars"
1,869576,Teacher Used the app to check on the status of...,Teacher Used app check status return. Informat...
2,425433,It was enjoyable and educative a good one.,It enjoyable educative good one.


## 3. Apply Lemmatization to processed_text and stopwords_removal fields
### (Also count number of words in processed_text field and store in length_words)

In [29]:
wn_lemmatizer = nltk.stem.WordNetLemmatizer()
# Created once and reused for every review instead of once per row.
detokenizer = TreebankWordDetokenizer()


def _lemmatize_tokens(tokens):
    """Lowercase and lemmatize a list of tokens, then join them back into text.

    Each token is lowercased and then passed through the WordNet lemmatizer
    three times in sequence: as a verb, with the default part of speech
    (noun), and as an adjective.

    Parameters
    ----------
    tokens : list of str
        Tokens as produced by ``nltk.word_tokenize``.

    Returns
    -------
    str
        The lemmatized tokens joined by ``TreebankWordDetokenizer``.
    """
    lemmas = []
    for word in tokens:
        word = word.lower()
        word = wn_lemmatizer.lemmatize(word, pos='v')
        word = wn_lemmatizer.lemmatize(word)
        word = wn_lemmatizer.lemmatize(word, pos='a')
        lemmas.append(word)
    return detokenizer.detokenize(lemmas)


# Tokenize processed_text once; the token lists feed both the lemmatized text
# and the word count stored in length_words.
processed_tokens = [nltk.word_tokenize(text) for text in df.processed_text]
temp = [_lemmatize_tokens(tokens) for tokens in processed_tokens]
length = [len(tokens) for tokens in processed_tokens]

# Columns are added in the same order as before so the frame layout is unchanged.
df['lemmatized_comment']=temp
df['length_words']=length

# Same treatment for the stopword-free text.
temp = [_lemmatize_tokens(nltk.word_tokenize(text)) for text in df.stopwords_removal]
df['stopwords_removal_lemmatization']=temp

df[["id","length_words","processed_text","lemmatized_comment","stopwords_removal","stopwords_removal_lemmatization"]].head(3)

Unnamed: 0,id,length_words,processed_text,lemmatized_comment,stopwords_removal,stopwords_removal_lemmatization
0,804084,11,"It's great, got lots of shows, 5 stars","it's great, get lot of show , 5 star","It's great, got lots shows, 5 stars","it's great, get lot show , 5 star"
1,869576,30,Teacher Used the app to check on the status of...,teacher use the app to check on the status of ...,Teacher Used app check status return. Informat...,teacher use app check status return . informat...
2,425433,9,It was enjoyable and educative a good one.,it be enjoyable and educative a good one.,It enjoyable educative good one.,it enjoyable educative good one.


## 4. Apply Tense to processed_text field

In [30]:
# Count verb tenses in each review from the POS tags returned by CoreNLP, for
# comparison with the values in Bug_tt:
#   future         = MD tokens whose text is exactly 'will' or 'shall' (case-sensitive)
#   past           = VBD, VBN
#   present_simple = VB, VBP, VBZ
#   present_con    = VBG
# The four counts are collected in replic_tense and added to df as columns of
# the same names.
#
# Requires a CoreNLP server listening on http://localhost:9000; if it is not
# running, the tagging call fails with a ConnectionError.

FUTURE_WORDS = ('will', 'shall')
PAST_TAGS = ('VBD', 'VBN')
PRESENT_SIMPLE_TAGS = ('VBP', 'VBZ', 'VB')
PRESENT_CON_TAGS = ('VBG',)

# One tagger client serves every review; it does not need rebuilding per row.
pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')

future = []
past = []
present_simple = []
present_con = []

# Iterate over the values directly so the loop does not depend on df having a
# default 0..n-1 index.
for text in df['processed_text']:
    tagged_words = list(pos_tagger.tag(text.split()))

    future_count = 0
    past_count = 0
    present_simple_count = 0
    present_con_count = 0
    for tagged in tagged_words:
        word, tag = tagged[0], tagged[1]
        # A token carries a single tag, so the categories are mutually exclusive.
        if tag == 'MD' and word in FUTURE_WORDS:
            future_count += 1
        elif tag in PAST_TAGS:
            past_count += 1
        elif tag in PRESENT_SIMPLE_TAGS:
            present_simple_count += 1
        elif tag in PRESENT_CON_TAGS:
            present_con_count += 1

    future.append(future_count)
    past.append(past_count)
    present_simple.append(present_simple_count)
    present_con.append(present_con_count)

tense_dict = {'future': future, 'past': past, 'present_simple': present_simple, 'present_con': present_con}
# Index the counts like df so the column assignment below lines up row-for-row.
replic_tense = pd.DataFrame(tense_dict, index=df.index)
replic_tense = replic_tense[['future', 'past', 'present_simple','present_con']]

for tense_column in ['future', 'past', 'present_simple', 'present_con']:
    df[tense_column] = replic_tense[tense_column]

ConnectionError: HTTPConnectionPool(host='localhost', port=9000): Max retries exceeded with url: /?properties=%7B%22ssplit.isOneSentence%22%3A+%22true%22%2C+%22outputFormat%22%3A+%22json%22%2C+%22annotators%22%3A+%22tokenize%2Cssplit%2Cpos%22%7D (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x0000000010D9A288>: Failed to establish a new connection: [Errno 10061] No connection could be made because the target machine actively refused it',))

## 5a. Sentiment (step 1/3): 
### Extract data to feed into the standalone SentiStrength software

In [31]:
# Export id + processed_text to a delimited text file that is fed to the
# standalone SentiStrength tool (the scores are read back in a later cell).
tempDF = df[["id","processed_text"]].copy()

# Replace embedded newlines, carriage returns and tabs with a single space --
# presumably so each review stays on one line of the export.
flattened_text = tempDF["processed_text"]
for control_char in ("\n", "\r", "\t"):
    flattened_text = flattened_text.str.replace(control_char, " ", regex=False)

# Trailing whitespace was creating a new line upon export, and there is no
# reason to keep leading whitespace either, so both ends are stripped.
tempDF["processed_text"] = flattened_text.str.strip()

senti_input_name = filename+"_for_senti.txt"
tempDF[["id","processed_text"]].to_csv(
    senti_input_name,
    header=True,
    index=False,
    sep=delimiter,
    encoding="utf-8",
)
print(filename+"_for_senti.txt saved for input to sentiStrength")


sample_384_all_coded.csv_for_senti.txt saved for input to sentiStrength


## 5b. Sentiment (step 2/3): 
### Use the standalone SentiStrength software with the following parameters/instructions:

**(This step must be performed manually outside of the notebook, BUT the result file from the previous time this process was done is already stored in the folder, so you can skip this step.)**

Use SentiStrength 2.3 from (http://sentistrength.wlv.ac.uk/)
Use Sept 21, 2011 configuration files downloaded from same site
Use the following settings in menu:
![settings0](sshot0-senti_settings0.png)
![settings1](sshot1-senti_settings1.png)
![settings2](sshot2-senti_settings2.png)
![settings3](sshot3-senti_settings3.png)

- Use Sentiment Strength Analysis -> Analyze ALL Texts in File....As Above for ALL files in folder

- Select the file that was exported from the previous step ("sample_384_all_coded.csv_for_senti.txt")

- "Yes" when prompted if it should echo the header in the results

- When prompted which column contains text, enter "2" (do not use the default value of 3)

- Results will be saved with "+results" appended to the filename in the folder where the input file(s) were stored.


## 5c. Sentiment (step 3/3): read results back in and add to dataframe

In [32]:
def find_max_sent(vect):
    """Collapse a (positive, negative) sentiment pair into a single sentiScore.

    Parameters
    ----------
    vect : indexable
        ``vect[0]`` is the SentiScore_pos rating and ``vect[1]`` the
        SentiScore_neg rating.

    Returns
    -------
    The rating that is furthest from 0. On a tie the negative rating is
    returned, since the authors of the paper appear to have used the negative
    score in that case.
    """
    pos, neg = vect[0], vect[1]
    return pos if abs(pos) > abs(neg) else neg



# Read the SentiStrength results file and rename its two score columns to match
# what they are called in the paper's original dataset.
senti_results_name = filename+"_for_senti+results.txt"
resultsDF = pd.read_csv(senti_results_name, sep=delimiter, encoding="utf-8")
resultsDF = resultsDF.rename(columns={"Positive":"sentiScore_pos","Negative":"sentiScore_neg"})

# Attach the scores to the reviews by id.
senti_columns = resultsDF[["id","sentiScore_pos","sentiScore_neg"]]
df = pd.merge(df, senti_columns, on="id")

# Use find_max_sent() to assign a single sentiment score per review.
score_pairs = df[["sentiScore_pos","sentiScore_neg"]]
df["sentiScore"] = score_pairs.apply(find_max_sent, axis=1)

# Count of reviews per sentiScore; not displayed because it is not the cell's
# last expression.
df.groupby("sentiScore")["id"].count()
df[["id","processed_text","sentiScore","sentiScore_pos","sentiScore_neg"]].head()

Unnamed: 0,id,processed_text,sentiScore,sentiScore_pos,sentiScore_neg
0,804084,"It's great, got lots of shows, 5 stars",3,3,-1
1,869576,Teacher Used the app to check on the status of...,-1,1,-1
2,425433,It was enjoyable and educative a good one.,3,3,-1
3,855228,great way to send or receive money,3,3,-1
4,1010397,the amount of ads is ridiculous.,-3,1,-3


***

## 6. Rename, add, and drop columns
### (to match the structure of the input files used in the paper)

In [33]:
# Rename, drop and add columns so the frame matches bug_tt.json.
column_renames = {"processed_text":"comment","score":"rating","userName":"reviewer"}
unused_columns = ["appTitle","text","fileDate","fileCategories","contentRating"]

df.rename(columns=column_renames, inplace=True)
df.drop(columns=unused_columns, inplace=True)

# Columns we have no values for are added as NaN placeholders.
for placeholder_column in ["stemmed", "fee", "title"]:
    df[placeholder_column] = np.nan
# Tag every row with the data set it came from.
df['dataSource'] = "Data542_Dataset"

df.info()



<class 'pandas.core.frame.DataFrame'>
Int64Index: 384 entries, 0 to 383
Data columns (total 22 columns):
id                                 384 non-null int64
reviewer                           354 non-null object
date                               384 non-null object
rating                             384 non-null int64
appId                              384 non-null object
reviewId                           384 non-null int64
comment                            384 non-null object
label_UE                           384 non-null int64
label_BR                           384 non-null int64
label_FR                           384 non-null int64
label_R                            384 non-null int64
stopwords_removal                  384 non-null object
lemmatized_comment                 384 non-null object
length_words                       384 non-null int64
stopwords_removal_lemmatization    384 non-null object
sentiScore_pos                     384 non-null int64
sentiScore_neg          

## 7. Output to separate json files based on the label
### (labels are: "Bug","Feature","UserExperience","Rating")

In [34]:
# Write one balanced JSON file per label. Each file holds the reviews that carry
# the label plus an equal number of reviews that do not; whichever group is
# larger is cut down to its FIRST rows (no random sampling).

colList = ["label_UE","label_BR","label_FR","label_R"]
labelNames = ["UserExperience","Bug","Feature","Rating"]

# The review columns without the four label flags are the same for every label,
# so they are computed once.
reviews_without_flags = df.drop(colList, axis=1)

for labelCol, labelName in zip(colList, labelNames):
    print("\nProcessing:",labelName)

    # All reviews of the specific label. Explicit copies are taken so the new
    # "label" column is added to an independent frame, not a slice of df.
    tempdf = reviews_without_flags[df[labelCol]==1].copy()
    tempdf["label"] = labelName
    cnt = len(tempdf) #how many reviews are there of the specific label
    print("Number of",labelName,"reviews available:",cnt)

    # All reviews that are NOT the specific label.
    notTempdf = reviews_without_flags[df[labelCol]==0].copy()
    notTempdf["label"] = ("Not_"+labelName)
    cnt2 = len(notTempdf)
    print("Number of NOT_"+labelName,"reviews available:",cnt2)

    if cnt2 >= cnt:
        # Enough negatives: keep only as many as there are positives.
        print("Selecting first",cnt,"NOT_"+labelName,"reviews.")
        notTempdf = notTempdf.iloc[0:cnt].copy()
    else:
        # Too few negatives: keep them all and trim the positives instead.
        print("**Only", cnt2, "NOT_"+labelName, "reviews are available. Will use all of them and...")
        print("Removing",cnt-cnt2,labelName,"reviews to create a balanced data set.")
        tempdf = tempdf.iloc[0:cnt2].copy()

    # pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x.
    tempdf = pd.concat([tempdf, notTempdf])
    tempdf.to_json(labelName+"_ourdata.json", orient="records")



Processing: UserExperience
Number of UserExperience reviews available: 39
Number of NOT_UserExperience reviews available: 345
Selecting first 39 NOT_UserExperience reviews.

Processing: Bug
Number of Bug reviews available: 58
Number of NOT_Bug reviews available: 326
Selecting first 58 NOT_Bug reviews.

Processing: Feature
Number of Feature reviews available: 43
Number of NOT_Feature reviews available: 341
Selecting first 43 NOT_Feature reviews.

Processing: Rating
Number of Rating reviews available: 319
Number of NOT_Rating reviews available: 65
**Only 65 NOT_Rating reviews are available. Will use all of them and...
Removing 254 Rating reviews to create a balanced data set.
