In [176]:
import pandas as pd
import nltk
import re
import numpy as np
import os
import string
import gensim
import random
import ast
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

### Vader Feature

In [177]:
# Load the first cleaned dataset; its 'posts' column is scored with VADER below.
df1=pd.read_csv('Clean_v1.csv')
from nltk.sentiment import SentimentIntensityAnalyzer
# One shared analyzer instance, reused by vaderScore() for every row.
sia = SentimentIntensityAnalyzer()
def vaderScore(comments):
    """Return the VADER polarity scores of a post after stripping noise.

    Before scoring, the following are removed, in this order:
    URLs of the form ``scheme://...``, literal ``...`` ellipses, runs of
    digits, and the ``|||`` separator.

    Parameters
    ----------
    comments : str
        Raw post text.

    Returns
    -------
    The value returned by ``sia.polarity_scores`` for the cleaned text
    (a mapping; it is unpacked with ``**`` when the features are combined).
    """
    # Raw strings: the earlier non-raw literals ('\w', '\/', '\.', '\|')
    # relied on invalid string escape sequences, which newer Python versions
    # warn about. The patterns themselves are unchanged.
    cleaned = re.sub(r'\w+:\/\/\S+', '', comments)
    cleaned = re.sub(r'\.\.\.', '', cleaned)
    cleaned = re.sub(r'[0-9]+', '', cleaned)
    cleaned = re.sub(r'\|\|\|', '', cleaned)
    # Keep the text and the score under different names instead of rebinding
    # ``comments`` to a non-string value.
    return sia.polarity_scores(cleaned)

# Score every post, then keep only the merge keys plus the sentiment feature.
df1['vaderScore'] = df1['posts'].apply(vaderScore)
vaderDF = df1[["Unnamed: 0", 'type', 'vaderScore']]

### Emoji Feature

In [178]:
# Load the second cleaned dataset; its 'New' and 'Clean' columns feed the features below.
df = pd.read_csv('clean_v2.csv')

In [179]:
# Rebuild each post as one string from its stringified token list.
# ast.literal_eval only accepts Python literals, so (unlike eval) a malformed
# or malicious CSV cell cannot execute code.
words = df['New']
posts = []
for serialized_tokens in words:
    tokens = ast.literal_eval(serialized_tokens)
    # Every token is followed by one space, including the last one.
    posts.append(''.join(token + ' ' for token in tokens))

# Collect angle-bracketed markup: '<', then 2-25 characters (non-greedy), then '>'.
emote_pattern = re.compile(r'<.{2,25}?>')
emotes_list = []
for post in posts:
    for emote in emote_pattern.findall(post):
        # Keep only matches with exactly one '<' and one '>'.
        # Filtering while collecting replaces list.remove() inside a loop over
        # the same list, which skipped the element following each removal and
        # therefore let some malformed matches through.
        if emote.count('<') + emote.count('>') <= 2:
            emotes_list.append(emote)

fd = nltk.FreqDist(emotes_list)
# The 40 most frequent markup tokens become the emoji feature vocabulary.
common_emojis = [emote for emote, _count in fd.most_common(40)]

In [180]:
def emojiClean(comments):
    """Strip list punctuation from a stringified token list.

    Removes every ``[``, ``]``, ``'`` and ``,`` character so that the text
    form of a Python list (e.g. ``"['a', 'b']"``) becomes space-separated
    tokens (``"a b"``).

    Parameters
    ----------
    comments : str
        Text to clean.

    Returns
    -------
    str
        ``comments`` with the four characters removed.
    """
    # One pass with a raw-string character class replaces four separate
    # substitutions whose non-raw patterns ("\[", "\]", "\,") used invalid
    # string escape sequences.
    return re.sub(r"[\[\]',]", '', comments)

def feature_extractor(doc):
    """Map each entry of the global ``common_emojis`` to whether it occurs in ``doc``.

    The check is ``emoji in doc``, so for a string ``doc`` it is a substring test.
    """
    return {emoji: (emoji in doc) for emoji in common_emojis}

# Flatten the stringified token list, then flag which common emojis each post contains.
df['Emoji'] = df['New'].apply(emojiClean)
df['EmojiFeature'] = df['Emoji'].apply(feature_extractor)

### Common ADJ Feature

In [181]:
# Stop-word collection taken from gensim; cleaner() checks tokens against it.
stopwordlist = gensim.parsing.preprocessing.STOPWORDS
def cleaner(post):
    """Tokenize a post and return its lower-cased content words.

    Steps:
      1. Remove ``<...>`` markup, parentheses, asterisks, ``X/Y`` capital-letter
         pairs and runs of underscores.
      2. Tokenize with ``nltk.word_tokenize``.
      3. Drop tokens that are stop words (per the global ``stopwordlist``) or
         that occur in ``string.punctuation``.
      4. Keep only words longer than 3 characters.

    Parameters
    ----------
    post : str
        Post text.

    Returns
    -------
    list of str
        Lower-cased tokens that passed every filter.
    """
    # Raw string so the regex escapes are not read as (invalid) string escapes;
    # the pattern itself is unchanged.
    stripped = re.sub(r'\<[A-Za-z]+.[\s\w]*\>|\(|\)|\*|[A-Z]\/[A-Z]|\_+', '', post)
    tokens = nltk.word_tokenize(stripped)
    # A token is kept only when it is neither a stop word NOR punctuation.
    # The earlier `or` kept a token whenever either test passed, so ordinary
    # stop words were never filtered out.
    content_words = [
        token.lower()
        for token in tokens
        if token.lower() not in stopwordlist and token not in string.punctuation
    ]
    # Keep only words longer than 3 characters.
    return [word for word in content_words if len(word) > 3]

def feature_common_adjs(words):
    """Return the 200 most frequent alphabetic adjectives in ``words``.

    ``words`` is POS-tagged with ``nltk.pos_tag``; tokens whose tag starts with
    'JJ' and that are purely alphabetic (guards against uncleaned tokens) are
    counted, and the 200 most common are returned, most frequent first.
    """
    adjective_counts = nltk.FreqDist(
        token
        for token, tag in nltk.pos_tag(words)
        if tag.startswith('JJ') and token.isalpha()
    )
    return [adjective for adjective, _count in adjective_counts.most_common(200)]

def check_adjs(post):
    """Count the tokens of ``post`` that appear in the global ``adjs`` list.

    Repeated tokens are counted each time they occur. The result is wrapped in
    a one-entry feature dict.
    """
    hits = 0
    for token in post:
        if token in adjs:
            hits += 1
    return {'AdjInTop200 ': hits}

# Tokenize every post once and keep the tokens for the features below.
df['Cleaner'] = df['Clean'].apply(cleaner)
# Flatten all posts into a single token list (original author's note: total 5994734).
corpus = [token for tokens in df['Cleaner'] for token in tokens]
# Corpus-wide top adjectives, then the per-post count of tokens among them.
adjs = feature_common_adjs(corpus)
df['ADJs'] = df['Cleaner'].apply(check_adjs)

### Most Common Feature

In [182]:
# Flatten the tokens already stored in df['Cleaner'] (cleaner() applied to
# df['Clean']) instead of tokenizing every post a second time. Iterating the
# column directly also avoids label-based df['Clean'][i] lookups, which only
# work when the index is 0..n-1.
all_word = [token for tokens in df['Cleaner'] for token in tokens]

fd = nltk.FreqDist(all_word)
# The 200 most frequent words across all posts, most frequent first.
most_common = fd.most_common(200)
most = [word for word, _count in most_common]

# Set the feature
def most_feature(post):
    """Count the distinct tokens of ``post`` that are in the global ``most`` list.

    Parameters
    ----------
    post : iterable of str
        Tokens of one post; duplicates are counted once.

    Returns
    -------
    dict
        ``{'CommonWordCount': n}``. The key is always present: an empty post
        yields a count of 0 (previously it produced an empty dict, so the
        feature was silently missing for such posts).
    """
    count = sum(1 for word in set(post) if word in most)
    return {'CommonWordCount': count}

# Per-post count of distinct tokens that are among the most common words.
df['MostCommonWord'] = df['Cleaner'].apply(most_feature)


### Average Sentence Length (Words per Sentence)

In [183]:
def avg_word(sent):
    """Return the average number of words per sentence in ``sent``.

    Words are whitespace-separated tokens. Sentences are the non-blank pieces
    obtained by splitting on runs of ``.``, ``!`` or ``?``.

    Parameters
    ----------
    sent : str
        Post text.

    Returns
    -------
    float
        Words divided by sentences, rounded to one decimal. Text with no
        sentence content is treated as a single sentence, so the result is
        0.0 for an empty string.
    """
    # str.split() with no argument replaces the hand-written character scan.
    # It also treats tabs and newlines as separators, not only spaces.
    word_count = len(sent.split())

    # Ignore blank pieces: text ending in '.', '!' or '?' produces a trailing
    # empty piece that used to be counted as an extra sentence.
    pieces = re.split(r'[.!?]+', sent)
    num_sent = sum(1 for piece in pieces if piece.strip())
    # Never divide by zero (e.g. empty or punctuation-only text).
    num_sent = max(num_sent, 1)

    return round(word_count / num_sent, 1)

def avg_feature(num):
    """Wrap ``num`` in a one-entry feature dict keyed by 'AvgSentLength'."""
    return {'AvgSentLength': num}

# Compute words-per-sentence for each post and wrap it as a one-entry feature dict.
df['AvgSentLength'] = df['Clean'].apply(lambda post: avg_feature(avg_word(post)))

### TFIDF Feature

In [None]:
### Run if you have 18 hrs to spare.... if not just import from pickle file: PickleFeatures
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(posts)
# Vocabulary size = number of columns of the document-term matrix. Reading it
# from the matrix avoids vectorizer.get_feature_names(), which recent
# scikit-learn releases removed in favour of get_feature_names_out().
num_word = tfidf.shape[1]

def feature_function(post_index):
    """Return the TF-IDF feature dict for the post at ``post_index``.

    The value is the sum of the TF-IDF weights in row ``post_index`` of the
    global ``tfidf`` matrix, rounded to 3 decimals.

    NOTE(review): the key is 'avg_tfidf' but the value is a sum, not a mean.
    The previous version computed the post's token count and never used it,
    which suggests a division was intended. The sum is kept here so the output
    matches what this function has always produced (presumably also what the
    cached 'PickleFeatures' file holds; confirm before changing).

    Parameters
    ----------
    post_index : int
        Row position of the post in ``tfidf``.

    Returns
    -------
    dict
        ``{'avg_tfidf': <rounded row sum>}``.
    """
    # Summing the row directly replaces one sparse-matrix lookup per vocabulary
    # column (and an unused tokenization of the post), which made this loop
    # extremely slow.
    sum_tfidf = float(tfidf[post_index].sum())
    return {'avg_tfidf': round(sum_tfidf, 3)}

# One feature dict per post. The number of rows comes from the data instead of
# the hard-coded 8675, so the cell keeps working if the dataset size changes.
lst = [feature_function(post_index) for post_index in range(len(posts))]

df["Avg TFIDF"] = lst

### AVG Char

In [184]:
def avgChar(doc):
    """Return the average length of the alphabetic words in a stringified list.

    Parameters
    ----------
    doc : str
        Text form of a Python list of tokens, e.g. ``"['hello', 'abc']"``.

    Returns
    -------
    dict
        ``{'AvgChar': value}`` where ``value`` is the mean length, rounded to
        2 decimals, of the tokens that are purely alphabetic and longer than
        2 characters; 0 when no token qualifies.

    Raises
    ------
    ValueError or SyntaxError
        If ``doc`` is not a valid Python literal.
    """
    # literal_eval parses literals only, so the cell contents can never run
    # code the way eval() could.
    tokens = ast.literal_eval(doc)
    lengths = [len(token) for token in tokens if token.isalpha() and len(token) > 2]
    if not lengths:
        return {'AvgChar': 0}
    return {'AvgChar': round(sum(lengths) / len(lengths), 2)}
# Average word length per post, computed from the stringified token list.
df['AvgChar'] = df['New'].apply(avgChar)

### Joining the dataframe

In [185]:
# Merge df and vaderDF: they were built from different cleaned files, so rows
# are matched on the shared 'Unnamed: 0' and 'type' columns. The inner join
# keeps only rows whose key pair appears in both frames.
dfMerge = pd.merge(df, vaderDF, on=['Unnamed: 0','type'], how ='inner')

In [187]:
# Use this instead of running the Avg TFIDF function: load the previously
# saved 'Avg TFIDF' column and join it onto dfMerge using the same keys.
# NOTE(review): unpickling can execute arbitrary code, so only load a file you trust.
# NOTE(review): if the TFIDF cell above was also run, dfMerge already has an
# 'Avg TFIDF' column and this merge would presumably produce suffixed
# duplicates instead. Confirm that only one of the two paths is used.
pickleDF = pd.read_pickle('PickleFeatures')
pickleDF =pickleDF[['Unnamed: 0','type','Avg TFIDF']]
dfMerge = pd.merge(dfMerge, pickleDF, on=['Unnamed: 0','type'], how ='inner')

In [199]:
# Subset dfMerge to the label column plus the feature-output columns, then
# cache the result on disk.
featuresDF=dfMerge[['type','EmojiFeature','ADJs','AvgSentLength','MostCommonWord','AvgChar', 'Avg TFIDF','vaderScore']]
featuresDF.to_pickle("PickleFeaturesV2")

### Combining Features into a Dict

In [None]:
# Reload the cached feature table. Note: this rebinds `df`, replacing the frame used above.
df = pd.read_pickle('PickleFeaturesV2')

In [None]:
# Add an empty placeholder column to df (filled in by the next cell), then
# display the frame.
df['CombineDict']= ''
df

In [None]:
# Merge every per-feature dict of a row into one dict and pair it with the
# row's label, giving one (features, label) tuple per row.
# The whole column is assigned at once: df['CombineDict'][i] = ... is chained
# indexing, which pandas warns may write to a temporary copy and which does
# not update df when copy-on-write is enabled.
feature_columns = ['EmojiFeature', 'ADJs', 'AvgSentLength', 'MostCommonWord',
                   'AvgChar', 'Avg TFIDF', 'vaderScore']
combined = []
for *feature_dicts, label in zip(*(df[column] for column in feature_columns), df['type']):
    merged = {}
    # Later columns win on duplicate keys, the same as {**a, **b, ...}.
    for feature_dict in feature_dicts:
        merged.update(feature_dict)
    combined.append((merged, label))
# Explicit object Series aligned to df's index so each cell holds one tuple.
df['CombineDict'] = pd.Series(combined, index=df.index, dtype=object)

In [None]:
# Persist the frame, including the combined (features, label) column.
df.to_pickle("PickleFeatureSet")