In [2]:
import warnings
warnings.filterwarnings('ignore')
import re
import string
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
import pickle
import plotly.figure_factory as ff

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from imblearn.pipeline import Pipeline

import xgboost as xgb
from sklearn.naive_bayes import GaussianNB

from sklearn.feature_extraction.text import CountVectorizer

In [3]:
import nltk
from nltk.corpus import stopwords

from sklearn.metrics import classification_report

In [4]:
# Defining the global variables for the color schemes we will incorporate
pblue = "#496595"
pb2 = "#85a1c1"
pb3 = "#3f4d63"
pg = "#c6ccd8"
pb = "#202022"
pbg = "#f4f0ea"

pgreen = px.colors.qualitative.Plotly[2]

In [12]:
df = pd.read_csv('tweets.csv')[['tweet', 'target']]
df.head(10)

Unnamed: 0,tweet,target
0,Today in Selfcare: beauty &amp; laughs Kung Fu...,0
1,I get to spend New Year's home again alone and...,1
2,"Depressed and lonely /: Stuck in a deep, never...",1
3,If this is your response to someone saying the...,0
4,Apparently you get a free pass just by mention...,0
5,When you will never again give birth to violen...,0
6,Learning to pretend to have a good time had be...,1
7,Aw man im outta pizza rolls,0
8,When you go out and try to be a part of life &...,0
9,So far he stop texting me…after I said somethi...,1


In [13]:
print(f'missing values count\n{df.isna().sum()}')

missing values count
tweet     0
target    0
dtype: int64


In [14]:
print('logest text length', np.max(df['tweet'].apply(lambda x: len(x.split())).values))

logest text length 61


In [15]:
# Checking balance of dataset
grouped_df = df.groupby('target').count().values.flatten()

fig = go.Figure()

fig.add_trace(go.Bar(
        x=['0'],
        y=[grouped_df[0]],
        name='Not-Depressed',
        text=[grouped_df[0]],
        textposition='auto',
        marker_color=pblue
))
fig.add_trace(go.Bar(
        x=['1'],
        y=[grouped_df[1]],
        name='Depressed',
        text=[grouped_df[1]],
        textposition='auto',
        marker_color=pg
))

fig.update_layout(
    title='Category distribution in the dataset')

fig.show()


In [16]:
# Creating series with length as index
# Sorting the series by index i.e. length
len_df_ham = df[df['target']=='0'].tweet.apply(lambda x: len(x.split())).value_counts().sort_index()
len_df_spam = df[df['target']=='1'].tweet.apply(lambda x: len(x.split())).value_counts().sort_index()


fig = go.Figure()
fig.add_trace(go.Scatter(
x=len_df_ham.index,
y=len_df_ham.values,
name='Not-Depressed',
fill='tozeroy',
marker_color=pblue))

fig.add_trace(go.Scatter(
x=len_df_spam.index,
y=len_df_spam.values,
name='Depressed',
fill='tozeroy',
marker_color=pg
))

fig.update_layout(
    title='Frequency of Tweets lengths')
fig.update_xaxes(range=[0, 80])
fig.show()


In [17]:
# emoticons
def load_dict_smileys():
    
    return {
        ":‑)":"smiley",
        ":-]":"smiley",
        ":-3":"smiley",
        ":->":"smiley",
        "8-)":"smiley",
        ":-}":"smiley",
        ":)":"smiley",
        ":]":"smiley",
        ":3":"smiley",
        ":>":"smiley",
        "8)":"smiley",
        ":}":"smiley",
        ":o)":"smiley",
        ":c)":"smiley",
        ":^)":"smiley",
        "=]":"smiley",
        "=)":"smiley",
        ":-))":"smiley",
        ":‑D":"smiley",
        "8‑D":"smiley",
        "x‑D":"smiley",
        "X‑D":"smiley",
        ":D":"smiley",
        "8D":"smiley",
        "xD":"smiley",
        "XD":"smiley",
        ":‑(":"sad",
        ":‑c":"sad",
        ":‑<":"sad",
        ":‑[":"sad",
        ":(":"sad",
        ":c":"sad",
        ":<":"sad",
        ":[":"sad",
        ":-||":"sad",
        ">:[":"sad",
        ":{":"sad",
        ":@":"sad",
        ">:(":"sad",
        ":'‑(":"sad",
        ":'(":"sad",
        ":‑P":"playful",
        "X‑P":"playful",
        "x‑p":"playful",
        ":‑p":"playful",
        ":‑Þ":"playful",
        ":‑þ":"playful",
        ":‑b":"playful",
        ":P":"playful",
        "XP":"playful",
        "xp":"playful",
        ":p":"playful",
        ":Þ":"playful",
        ":þ":"playful",
        ":b":"playful",
        "<3":"love"
        }

# self defined contractions
def load_dict_contractions():
    
    return {
        "ain't":"is not",
        "amn't":"am not",
        "aren't":"are not",
        "can't":"cannot",
        "'cause":"because",
        "couldn't":"could not",
        "couldn't've":"could not have",
        "could've":"could have",
        "daren't":"dare not",
        "daresn't":"dare not",
        "dasn't":"dare not",
        "didn't":"did not",
        "doesn't":"does not",
        "don't":"do not",
        "e'er":"ever",
        "em":"them",
        "everyone's":"everyone is",
        "finna":"fixing to",
        "gimme":"give me",
        "gonna":"going to",
        "gon't":"go not",
        "gotta":"got to",
        "hadn't":"had not",
        "hasn't":"has not",
        "haven't":"have not",
        "he'd":"he would",
        "he'll":"he will",
        "he's":"he is",
        "he've":"he have",
        "how'd":"how would",
        "how'll":"how will",
        "how're":"how are",
        "how's":"how is",
        "I'd":"I would",
        "I'll":"I will",
        "I'm":"I am",
        "I'm'a":"I am about to",
        "I'm'o":"I am going to",
        "isn't":"is not",
        "it'd":"it would",
        "it'll":"it will",
        "it's":"it is",
        "I've":"I have",
        "kinda":"kind of",
        "let's":"let us",
        "mayn't":"may not",
        "may've":"may have",
        "mightn't":"might not",
        "might've":"might have",
        "mustn't":"must not",
        "mustn't've":"must not have",
        "must've":"must have",
        "needn't":"need not",
        "ne'er":"never",
        "o'":"of",
        "o'er":"over",
        "ol'":"old",
        "oughtn't":"ought not",
        "shalln't":"shall not",
        "shan't":"shall not",
        "she'd":"she would",
        "she'll":"she will",
        "she's":"she is",
        "shouldn't":"should not",
        "shouldn't've":"should not have",
        "should've":"should have",
        "somebody's":"somebody is",
        "someone's":"someone is",
        "something's":"something is",
        "that'd":"that would",
        "that'll":"that will",
        "that're":"that are",
        "that's":"that is",
        "there'd":"there would",
        "there'll":"there will",
        "there're":"there are",
        "there's":"there is",
        "these're":"these are",
        "they'd":"they would",
        "they'll":"they will",
        "they're":"they are",
        "they've":"they have",
        "this's":"this is",
        "those're":"those are",
        "'tis":"it is",
        "'twas":"it was",
        "wanna":"want to",
        "wasn't":"was not",
        "we'd":"we would",
        "we'd've":"we would have",
        "we'll":"we will",
        "we're":"we are",
        "weren't":"were not",
        "we've":"we have",
        "what'd":"what did",
        "what'll":"what will",
        "what're":"what are",
        "what's":"what is",
        "what've":"what have",
        "when's":"when is",
        "where'd":"where did",
        "where're":"where are",
        "where's":"where is",
        "where've":"where have",
        "which's":"which is",
        "who'd":"who would",
        "who'd've":"who would have",
        "who'll":"who will",
        "who're":"who are",
        "who's":"who is",
        "who've":"who have",
        "why'd":"why did",
        "why're":"why are",
        "why's":"why is",
        "won't":"will not",
        "wouldn't":"would not",
        "would've":"would have",
        "y'all":"you all",
        "you'd":"you would",
        "you'll":"you will",
        "you're":"you are",
        "you've":"you have",
        "Whatcha":"What are you",
        "luv":"love",
        "sux":"sucks"
        }

In [18]:
!pip install emoji



In [19]:
!pip install fasttext



In [20]:
import itertools
import emoji
def tweet_cleaning_for_sentiment_analysis(tweet):    
    
    #Special case not handled previously.
    tweet = tweet.replace('\x92',"'")
    #Removal of hastags/account
    tweet = ' '.join(re.sub("(@[A-Za-z0-9]+)|(#[A-Za-z0-9]+)", " ", tweet).split())
    #Removal of address
    tweet = ' '.join(re.sub("(\w+:\/\/\S+)", " ", tweet).split())
    #Removal of Punctuation
    tweet = ' '.join(re.sub("[\.\,\!\?\:\;\-\=]", " ", tweet).split())
    tweet = re.sub('\[.*?\]', '', tweet)
    tweet = re.sub('https?://\S+|www\.\S+', '', tweet)
    tweet = re.sub('<.*?>+', '', tweet)
    tweet = re.sub('[%s]' % re.escape(string.punctuation), '', tweet)
    tweet = re.sub('\n', '', tweet)
    tweet = re.sub('\w*\d\w*', '', tweet)
    #Lower case
    tweet = tweet.lower()
    #CONTRACTIONS source: https://en.wikipedia.org/wiki/Contraction_%28grammar%29
    CONTRACTIONS = load_dict_contractions()
    tweet = tweet.replace("’","'")
    words = tweet.split()
    reformed = [CONTRACTIONS[word] if word in CONTRACTIONS else word for word in words]
    tweet = " ".join(reformed)
    # Standardizing words
    tweet = ''.join(''.join(s)[:2] for _, s in itertools.groupby(tweet))
    #Deal with emoticons source: https://en.wikipedia.org/wiki/List_of_emoticons
    SMILEY = load_dict_smileys()  
    words = tweet.split()
    reformed = [SMILEY[word] if word in SMILEY else word for word in words]
    tweet = " ".join(reformed)
    #Deal with emojis
    tweet = emoji.demojize(tweet)
    tweet = tweet.replace(":"," ")
    tweet = ' '.join(tweet.split())

    return tweet

In [38]:
df['clean_tweet'] = df['tweet'].apply(tweet_cleaning_for_sentiment_analysis)

In [39]:
df.head()

Unnamed: 0,tweet,target,clean_tweet
0,Today in Selfcare: beauty &amp; laughs Kung Fu...,0,today in selfcare beauty amp laughs kung fu panda
1,I get to spend New Year's home again alone and...,1,i get to spend new years home again alone and ...
2,"Depressed and lonely /: Stuck in a deep, never...",1,depressed and lonely stuck in a deep never end...
3,If this is your response to someone saying the...,0,if this is your response to someone saying the...
4,Apparently you get a free pass just by mention...,0,apparently you get a free pass just by mention...


In [40]:
df.tail()

Unnamed: 0,tweet,target,clean_tweet
3195,"May the new year abound in H’s: Health, Hope, ...",0,may the new year abound in h's health hope hea...
3196,It is not a beautiful day as usual.,1,it is not a beautiful day as usual
3197,Compact Metal Leaf Grinder with Four Layers p...,0,compact metal leaf grinder with four layers pi...
3198,First Christmas in YEARS that I have to work. ...,0,first christmas in years that i have to work i...
3199,It's okay sus. Let that hurt go 🤗 * that's me ...,0,its okay sus let that hurt go hugging_face tha...


In [68]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [63]:
df.to_csv('preprocessed_tweets.csv', index=False)

In [30]:
def transform_instance(row):
    cur_row = []
    #Prefix the index-ed label with __label__
    label = "__label__" + row[1]  
    cur_row.append(label)
    cur_row.extend(nltk.word_tokenize(tweet_cleaning_for_sentiment_analysis(row[2].lower())))
    return cur_row
 

In [65]:
with open('preprocessed_tweets.csv', 'r') as csvinfile: 
  csv_reader = csv.reader(csvinfile, delimiter=',')
  for i, row in enumerate(csv_reader):
      print(row[1])
      break

target


In [66]:
import csv
def preprocess(input_file, output_file, keep=1):
    i=0
    with open(output_file, 'w') as csvoutfile:
        csv_writer = csv.writer(csvoutfile, delimiter=' ', lineterminator='\n')
        with open(input_file, 'r') as csvinfile: 
            csv_reader = csv.reader(csvinfile, delimiter=',')
            for row in csv_reader:
                if row[1] in ['0','1'] and row[2]!='':
                    row_output = transform_instance(row)
                    csv_writer.writerow(row_output )
                    print(row_output)
                i=i+1
                if i%10000 == 0:
                    print(i)


In [None]:

# Preparing the training dataset        
preprocess('preprocessed_tweets.csv', 'tweets.train')

In [70]:
def upsampling(input_file, output_file, ratio_upsampling=1):
    # Create a file with equal number of tweets for each label
    #    input_file: path to file
    #    output_file: path to the output file
    #    ratio_upsampling: ratio of each minority classes vs majority one. 1 mean there will be as much of each class than there is for the majority class 
    
    i=0
    counts = {}
    dict_data_by_label = {}

    # GET LABEL LIST AND GET DATA PER LABEL
    with open(input_file, 'r', newline='') as csvinfile: 
        csv_reader = csv.reader(csvinfile, delimiter=',', quotechar='"')
        for row in csv_reader:
            counts[row[0].split()[0]] = counts.get(row[0].split()[0], 0) + 1
            if not row[0].split()[0] in dict_data_by_label:
                dict_data_by_label[row[0].split()[0]]=[row[0]]
            else:
                dict_data_by_label[row[0].split()[0]].append(row[0])
            i=i+1
            if i%10000 ==0:
                print("read" + str(i))

    # FIND MAJORITY CLASS
    majority_class=""
    count_majority_class=0
    for item in dict_data_by_label:
        if len(dict_data_by_label[item])>count_majority_class:
            majority_class= item
            count_majority_class=len(dict_data_by_label[item])  
    
    # UPSAMPLE MINORITY CLASS
    data_upsampled=[]
    for item in dict_data_by_label:
        data_upsampled.extend(dict_data_by_label[item])
        if item != majority_class:
            items_added=0
            items_to_add = count_majority_class - len(dict_data_by_label[item])
            while items_added<items_to_add:
                data_upsampled.extend(dict_data_by_label[item][:max(0,min(items_to_add-items_added,len(dict_data_by_label[item])))])
                items_added = items_added + max(0,min(items_to_add-items_added,len(dict_data_by_label[item])))

    # WRITE ALL
    i=0

    with open(output_file, 'w') as txtoutfile:
        for row in data_upsampled:
            txtoutfile.write(row+ '\n' )
            i=i+1
            if i%10000 ==0:
                print("writer" + str(i))


upsampling( 'tweets.train','uptweets.train')


In [78]:
import datetime
import fasttext
import os

# Full path to training data.
training_data_path ='uptweets.train' 
# validation_data_path ='tweets.validation'
model_path =''
model_name="depression_model"

def train():
    print('Training start')
    try:
        hyper_params = {"lr": 0.1,
                        "epoch": 30,
                        "wordNgrams": 2,
                        "dim": 20,
                        "minCount": 7}     
                               
        print(str(datetime.datetime.now()) + ' START=>' + str(hyper_params) )

        # Train the model.
        model = fasttext.train_supervised(input=training_data_path, **hyper_params)
        print("Model trained with the hyperparameter \n {}".format(hyper_params))

        # CHECK PERFORMANCE
        print(str(datetime.datetime.now()) + 'Training complete.' + str(hyper_params) )
        
        model_acc_training_set = model.test(training_data_path)
        # model_acc_validation_set = model.test(validation_data_path)
        
        # DISPLAY ACCURACY OF TRAINED MODEL
        #  ", validation:" + str(model_acc_validation_set[1]) + 
        text_line = str(hyper_params) + ",accuracy:" + str(model_acc_training_set[1])  +'\n' 
        print(text_line)
        
        #quantize a model to reduce the memory usage
        model.quantize(input=training_data_path, qnorm=True, retrain=True, cutoff=100000)
        
        print("Model is quantized!!")
        model.save_model(os.path.join(model_path,model_name + ".ftz"))                
    
        ##########################################################################
        #
        #  TESTING PART
        #
        ##########################################################################            
        print(model.predict(['today is already off to a great start face_with_rolling_eyes'],k=2))
        print(model.predict(['my wish for is that no one suffering from should ever feel alone that they would receive help love support kindness'],k=2))
        
    except Exception as e:
        print('Exception during training: ' + str(e) )


# Train your model.
train()

Training start
2021-09-11 16:52:22.074623 START=>{'lr': 0.1, 'epoch': 30, 'wordNgrams': 2, 'dim': 20, 'minCount': 7}
Model trained with the hyperparameter 
 {'lr': 0.1, 'epoch': 30, 'wordNgrams': 2, 'dim': 20, 'minCount': 7}
2021-09-11 16:52:22.885757Training complete.{'lr': 0.1, 'epoch': 30, 'wordNgrams': 2, 'dim': 20, 'minCount': 7}
{'lr': 0.1, 'epoch': 30, 'wordNgrams': 2, 'dim': 20, 'minCount': 7},accuracy:0.9717828131680205

Model is quantized!!
([['__label__0', '__label__1']], [array([1.0000100e+00, 1.0008666e-05], dtype=float32)])
([['__label__0', '__label__1']], [array([0.99855775, 0.0014622 ], dtype=float32)])


In [74]:
model = fasttext.load_model('model-en.ftz')



In [77]:
 model.predict("i feel so tired and stressed out today")

(('__label__1',), array([0.92439234]))