In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import re
from nltk.corpus import stopwords

In [2]:
# Load the unlabeled training moments from a single CSV file into a DataFrame.
# NOTE(review): the earlier comment mentioned combining datasets, but only this one file is read here.
df = pd.read_csv('data/TRAIN/unlabeled_70k.csv')

In [3]:
# Preview the first rows to sanity-check that the CSV was parsed as expected.
df.head()

Unnamed: 0,hmid,moment,age,country,gender,married,parenthood,reflection,duration
0,27673,I went on a successful date with someone I fel...,35.0,USA,m,single,n,24h,at_least_one_hour
1,27675,I went to the gym this morning and did yoga.,30.0,USA,f,married,y,24h,at_least_one_hour
2,27678,I meditated last night.,23.0,IND,m,single,n,24h,at_least_one_hour
3,27679,"I made a new recipe for peasant bread, and it ...",30.0,USA,m,single,n,24h,half_a_day
4,27680,I got gift from my elder brother which was rea...,23.0,IND,m,single,n,24h,at_least_one_hour


In [4]:
# Summarise column dtypes and non-null counts to see which columns have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72324 entries, 0 to 72323
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   hmid        72324 non-null  int64  
 1   moment      72324 non-null  object 
 2   age         72228 non-null  float64
 3   country     72200 non-null  object 
 4   gender      72277 non-null  object 
 5   married     72229 non-null  object 
 6   parenthood  72274 non-null  object 
 7   reflection  72324 non-null  object 
 8   duration    72098 non-null  object 
dtypes: float64(1), int64(1), object(7)
memory usage: 5.0+ MB


Only taking the hmid and moment columns from the dataset

In [5]:
# Keep only the identifier and the free-text column used for modelling.
# The explicit .copy() makes this an independent frame, so the later in-place
# style mutations (duplicate removal, overwriting 'moment') cannot be
# ambiguous about whether they act on a view of the originally loaded data.
df = df[['hmid', 'moment']].copy()
df.head()

Unnamed: 0,hmid,moment
0,27673,I went on a successful date with someone I fel...
1,27675,I went to the gym this morning and did yoga.
2,27678,I meditated last night.
3,27679,"I made a new recipe for peasant bread, and it ..."
4,27680,I got gift from my elder brother which was rea...


### Checking the dataset for duplicates

We check for duplicates in the 'moment' column, since this is the column we will use to train our models.
The rows as a whole are not duplicated, but the values in the 'moment' column are. Because 'moment' is the only feature used to train the model, we need to eliminate these duplicates.

In [6]:
# Report the size of the frame, then collect every row whose 'moment' text
# repeats that of an earlier row.
print(f'Shape of the Dataset before removing duplicates : {df.shape}')
duplicate = df.loc[df.duplicated(subset='moment')]
print(f'Duplicates found in the moment column : {duplicate.shape}')
duplicate

Shape of the Dataset before removing duplicates : (72324, 2)
Duplicates found in the moment column : (2491, 2)


Unnamed: 0,hmid,moment
266,28058,after 2 month i bought new tcl 32 inch led tv ...
412,28258,after 2 month i bought new tcl 32 inch led tv ...
867,28911,i was working on a problem from the morning wh...
1226,29413,i was working on a problem from the morning wh...
1797,30266,My boyfriend told me he loved me.
...,...,...
72156,128540,I spent time with my family.
72187,128578,I had sex with my wife.
72254,128681,I had sex with my girlfriend.
72289,128722,Purchased a Air Cooler for my office to work w...


Removing the duplicates from the 'moment' column

In [7]:
# Keep a single row per distinct 'moment' text; rebinding the name instead of
# mutating in place yields the same frame.
df = df.drop_duplicates(subset="moment")
print(f'Shape of the Dataset after removing duplicates : {df.shape}')

Shape of the Dataset after removing duplicates : (69833, 2)


In [8]:
# Sanity check: after de-duplication no 'moment' value should repeat.
duplicate = df.loc[df.duplicated(subset='moment')]
print(f'Duplicates found in the moment column : {duplicate.shape}')

Duplicates found in the moment column : (0, 2)


In [9]:
# Dropping duplicate rows leaves gaps in the index; renumber it 0..n-1.
# drop=True discards the old index instead of inserting it as an 'index'
# column, so there is nothing extra to select away afterwards.
df = df.reset_index(drop=True)
df

Unnamed: 0,hmid,moment
0,27673,I went on a successful date with someone I fel...
1,27675,I went to the gym this morning and did yoga.
2,27678,I meditated last night.
3,27679,"I made a new recipe for peasant bread, and it ..."
4,27680,I got gift from my elder brother which was rea...
...,...,...
69828,128761,I spent time with my daughter.
69829,128762,My husband announced he is getting a decent bo...
69830,128763,Had a can of Pepsi to drink.
69831,128764,Cuddling with my girlfriend last night.


## Text Pre-Processing

During text pre-processing, we deliberately did not treat pronouns as stop words. This is because pronouns in particular can help identify the agency and social labels (e.g. "I", "we", "us", ...).

In [10]:
# Pre processing the moment text
def cleanHtml(sentence):
    """Replace each HTML-like tag (``<...>``) in ``sentence`` with one space.

    The input is coerced with ``str()`` first, so non-string values are
    accepted. Matching is non-greedy, so each tag is replaced separately.
    """
    return re.sub('<.*?>', ' ', str(sentence))

def cleanPunc(sentence):
    """Strip punctuation and special characters from ``sentence``.

    * ``? ! ' " # |`` are deleted outright.
    * ``. , ( ) \\ /`` are replaced by a space so neighbouring words stay apart.
    * Leading/trailing whitespace is trimmed and remaining newlines become spaces.

    Parameters
    ----------
    sentence : str
        Text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    # Inside a character class '|' is a literal pipe, not alternation, so the
    # classes are spelled out explicitly. The pipe is kept in the delete set
    # because the previous pattern removed it too.
    cleaned = re.sub(r'[?!\'"#|]', '', sentence)
    # The backslash must be escaped as '\\'; a lone '\' before '|' only
    # escaped the pipe, so backslashes were never replaced.
    cleaned = re.sub(r'[.,()\\/]', ' ', cleaned)
    return cleaned.strip().replace("\n", " ")

def keepAlpha(sentence):
    """Blank out non-letter characters in each whitespace-separated word.

    Every run of characters other than ASCII letters (or a space) inside a
    word is replaced by a single space. The words are then re-joined with
    single spaces and the result is trimmed at both ends.
    """
    scrubbed_words = (
        re.sub('[^a-z A-Z]+', ' ', token) for token in sentence.split()
    )
    return " ".join(scrubbed_words).strip()

# Pronouns (and pronoun-like words) that are deliberately NOT treated as stop
# words: they help identify the agency and social labels, so they must survive
# the filtering. ('theirs' was previously misspelled 'thiers', so it was still
# being removed as a stop word.)
_PRONOUNS_TO_KEEP = frozenset([
    'mine', 'ours', 'our', 'it', 'my', 'i', 'you', 'he', 'they', 'we', 'she',
    'who', 'them', 'me', 'him', 'one', 'her', 'us', 'himself', 'someone',
    'themselves', 'everyone', 'itself', 'anyone', 'myself', 'hers', 'his',
    'theirs', 'their', 'yourself', 'ourselves', 'all', 'anybody',
])

# Built on first use and reused afterwards, so the NLTK word list is not
# reloaded and re-filtered for every row.
_FILTERED_STOP_WORDS = None


def _filteredStopWords():
    """Return the English NLTK stop words minus the pronouns we keep."""
    global _FILTERED_STOP_WORDS
    if _FILTERED_STOP_WORDS is None:
        _FILTERED_STOP_WORDS = (
            frozenset(stopwords.words('english')) - _PRONOUNS_TO_KEEP
        )
    return _FILTERED_STOP_WORDS


def removeStopWords(sentence):
    """Remove English stop words from ``sentence`` while keeping pronouns.

    Parameters
    ----------
    sentence : str
        Whitespace-separated text. The comparison is case-sensitive, so the
        text should already be lower-cased.

    Returns
    -------
    str
        The surviving words, each followed by a single space (a non-empty
        result therefore ends with a trailing space, as before).
    """
    stop_words = _filteredStopWords()
    return "".join(
        word + " " for word in sentence.split() if word not in stop_words
    )

In [11]:
# Run the whole text-cleaning pipeline on the 'moment' column in one chain:
# lower-case -> strip HTML tags -> strip punctuation -> keep letters only ->
# drop stop words (pronouns are retained).
df['moment'] = (
    df['moment']
    .str.lower()
    .apply(cleanHtml)
    .apply(cleanPunc)
    .apply(keepAlpha)
    .apply(removeStopWords)
)

In [12]:
# Show the processed frame, then print the first five cleaned moments in full
# (the table view truncates long strings).
# The bare `df.reset_index()` call that used to be here discarded its result and
# therefore did nothing, so it has been removed. Iterating over `.head(5)`
# selects by position and does not rely on the index labels being 0..4.
display(df)
for moment_text in df['moment'].head(5):
    print(moment_text)

Unnamed: 0,hmid,moment
0,27673,i went successful date someone i felt sympathy...
1,27675,i went gym morning yoga
2,27678,i meditated last night
3,27679,i made new recipe peasant bread it came specta...
4,27680,i got gift my elder brother really surprising me
...,...,...
69828,128761,i spent time my daughter
69829,128762,my husband announced he getting decent bonus q...
69830,128763,pepsi drink
69831,128764,cuddling my girlfriend last night


i went successful date someone i felt sympathy connection 
i went gym morning yoga 
i meditated last night 
i made new recipe peasant bread it came spectacular 
i got gift my elder brother really surprising me 


### Writing the processed dataframe to a pickle file

In [13]:
# Persist the cleaned (hmid, moment) frame to 'unlabelled_data.pkl', relative to the current working directory.
df.to_pickle('unlabelled_data.pkl')