In [2]:
import pandas as pd
import numpy as np
import json

In [3]:
with open('sa_data/tweets.txt') as f:
    lines=f.readlines()

In [4]:
json_list=[]
for i,line in enumerate(lines):
    json_line = json.loads(lines[i])
    json_list.append(json_line)

In [5]:
df = pd.DataFrame.from_records(json_list)

In [6]:
extended_tweets = df.loc[df['extended_tweet'].notna()]['extended_tweet'].reset_index()

In [7]:
full_tweets=[]
for tweet in extended_tweets['extended_tweet']:
    full_tweets.append(tweet['full_text'])

In [8]:
extended_tweets['full_tweet'] = full_tweets
extended_tweets.head()

Unnamed: 0,index,extended_tweet,full_tweet
0,9,{'full_text': '#Youtubers &amp; #Traders vs #b...,"#Youtubers &amp; #Traders vs #buyers, #bitcoin..."
1,17,{'full_text': '@jiims hi ate ayu :) idk but i ...,@jiims hi ate ayu :) idk but i really wanna tr...
2,19,{'full_text': 'AllDayClearFreshness WithDONNY ...,AllDayClearFreshness WithDONNY\n\n#DonnyForCle...
3,33,{'full_text': '@UbisoftMTL @banquenationale @A...,@UbisoftMTL @banquenationale @AstrolabeGames @...
4,36,{'full_text': '@phillingtime Assigning blame m...,@phillingtime Assigning blame means you put th...


In [9]:
# testing 
extended_tweets.loc[extended_tweets['index']==36, 'full_tweet'].values

array(['@phillingtime Assigning blame means you put the burden of responsibility on the person that did the action that led to what happened, yes? No need for insults :) \nConsent means you allow something knowing the potential risks… so you are responsible for the outcomes, no?'],
      dtype=object)

In [10]:
# extracting non-extended tweets
tweets = df['text'].to_frame()

In [11]:
tweets.head()

Unnamed: 0,text
0,. . .\n\nOhh no...
1,@Diceman27 @Streamboosts Dropped you a follow ...
2,@sakura_addicted But that was today @ midnight...
3,RT @brokenworld05: Promotion Time ⏳\n\nMentio...
4,marnie gave me the pouch from her forgotten wo...


In [12]:
full_tweets = pd.merge(tweets, extended_tweets, left_index=True, right_on='index', how='left')

In [13]:
# fill missings with text only
full_tweets['full_tweet'].fillna(full_tweets['text'], inplace=True)

In [14]:
full_tweets=full_tweets[['full_tweet']].copy()
full_tweets.columns=['tweet']

In [15]:
import re
# initiate 0, -1 for ':(' texts, +1 for ':)' texts
full_tweets['label']=2
full_tweets.loc[full_tweets["tweet"].str.contains(r":\("), 'label']=0
full_tweets.loc[full_tweets["tweet"].str.contains(r":\)"), 'label']=1# 

In [16]:
full_tweets.reset_index(drop=True, inplace=True)

In [17]:
full_tweets.label.value_counts()

1    2239
0     870
2     622
Name: label, dtype: int64

In [18]:
full_tweets=full_tweets.loc[full_tweets['label']!=2]

In [19]:
full_tweets.reset_index(drop=True, inplace=True)

In [20]:
full_tweets.head()

Unnamed: 0,tweet,label
0,@Diceman27 @Streamboosts Dropped you a follow ...,1
1,@sakura_addicted But that was today @ midnight...,0
2,RT @brokenworld05: Promotion Time ⏳\n\nMentio...,1
3,marnie gave me the pouch from her forgotten wo...,1
4,RT @CriticalError09: I hate sex but also I am ...,1


In [21]:
full_tweets[full_tweets.duplicated(subset='tweet')]

Unnamed: 0,tweet,label
22,RT @DONATORSPH: All of our tagline and hashtag...,1
23,RT @DONATORSPH: All of our tagline and hashtag...,1
24,RT @DONATORSPH: All of our tagline and hashtag...,1
50,RT @ItsKaranology: LOGY HAS LEFT THE PLANET :)...,1
52,RT @DONATORSPH: All of our tagline and hashtag...,1
...,...,...
3092,RT @legacy_sol: ⚡️FLASH GIVEAWAY⚡️\n\n0.1 $SOL...,1
3102,"RT @abcd_efghiloveu: i miss you, babe :(\n ...",0
3105,RT @Zoe2Freaky: Kinda new to nsfw twt \n\n💖22\...,1
3106,RT @jongseongflirts: this photo sequence :( ht...,0


In [22]:
full_tweets = full_tweets.drop_duplicates(subset='tweet', keep='first', ignore_index=True)

In [23]:
label = full_tweets['label'].values

In [24]:
full_tweets.shape

(2531, 2)

In [25]:
import string
string.punctuation
def del_punct(text):
    text="".join([char for char in text if char not in string.punctuation])
    return text

In [26]:
processed_tweets = []

for t in full_tweets['tweet']:
    # remove user names
    t = re.sub("@[\S]+", "", t)

    # lower text
    t = t.lower()
    
    # remove punct
    t = re.sub("[^\w\s]", "", t)

    # strip multiple white space to 1
    t = re.sub(r"\s+", " ", t)

    # strip leading white space 
    t = t.strip()
    
    # remove http
    t = re.sub("http\w+", "", t)

    # append to list
    processed_tweets.append(t)

In [27]:
from nltk.corpus import stopwords

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [32]:
vectorizer = TfidfVectorizer(
    max_features=2000,
    min_df=3,
    max_df=0.9,
    stop_words=stopwords.words('english')
)

In [33]:
feat_vects = vectorizer.fit_transform(processed_tweets)

In [34]:
feat_vects.shape

(2531, 1245)

In [35]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
labels_encoded = le.fit_transform(label) # probably did not need to do

In [36]:
from sklearn.model_selection import train_test_split

In [37]:
X_train, X_test, y_train, y_test = train_test_split(feat_vects, labels_encoded, test_size=0.2)

In [38]:
from sklearn import naive_bayes
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
clf = naive_bayes.MultinomialNB()
logReg = LogisticRegression()
rfc = RandomForestClassifier(max_depth=50, n_estimators=200)

In [39]:
clf.fit(X_train, y_train)

In [40]:
y_pred = clf.predict(X_test)

In [41]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [58]:
train_pred = clf.predict(X_train)

In [59]:
accuracy_score(y_train, train_pred)

0.83399209486166

In [42]:
accuracy_score(y_test, y_pred)

0.7810650887573964

In [43]:
confusion_matrix(y_test, y_pred)

array([[ 41, 108],
       [  3, 355]])

In [46]:
logReg.fit(X_train, y_train)

In [47]:
accuracy_score(y_test, logReg.predict(X_test))

0.7652859960552268

In [48]:
rfc.fit(X_train, y_train)

In [49]:
accuracy_score(y_test, rfc.predict(X_test))

0.7652859960552268