In [231]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
pd.set_option('display.max_colwidth', 1000)

In [232]:
# Read the four CSV shards data/data_1.csv .. data/data_4.csv and stack them
# into one frame. `DataFrame.append` was removed in pandas 2.0 (and re-copied
# the accumulated frame on every iteration), so the shards are collected in a
# list and concatenated once.
# NOTE: as with the original append loop, each shard keeps its own row labels,
# so the combined index contains repeated labels.
frames = []
for i in range(1, 5):
    file_name = 'data/data_' + str(i) + '.csv'
    df_data = pd.read_csv(file_name)
    frames.append(df_data)
df = pd.concat(frames)

In [233]:
def remove_terms(tweet, keyword, character):
    """Strip prefixed terms (e.g. ``#tag`` or ``@user``) out of a tweet.

    Parameters
    ----------
    tweet : str
        Text to clean. Terms are lower-cased before matching, so callers are
        expected to pass already lower-cased text.
    keyword : iterable of str, or a falsy value
        Terms to remove, without the prefix. Surrounding single quotes on a
        term are stripped first. A falsy value (``0``, ``None``, ``[]``)
        means "nothing to remove" and the tweet is returned unchanged.
    character : str
        Prefix that precedes every term in the tweet, such as ``'#'`` or
        ``'@'``.

    Returns
    -------
    str
        The tweet with every whole ``character + term`` token removed.
    """
    if not keyword:
        return tweet
    for raw_term in keyword:
        term = raw_term.strip("''").lower()
        if not term:
            # An empty term would otherwise match the bare prefix character.
            continue
        # Only remove the token when it is not followed by another word
        # character; a plain substring replace would clip longer tokens that
        # share the prefix (removing "#suicide" out of "#suicideprevention").
        pattern = re.escape(character + term) + r"(?!\w)"
        tweet = re.sub(pattern, '', tweet)
    return tweet

In [234]:
# Removing some columns about user information
df = df.drop(columns=['link','username','name_x'])

# 'tweet' column to lowercase
df['tweet'] = df['tweet'].str.lower()

# Transform the data type of some columns
df['label'] = df['label'].astype(int)

# Transform object type to list.
# `hashtags` is handled as text: the brackets are stripped and the rest is
# split on ', '. The literal "[]" becomes 0, which remove_terms treats as
# "nothing to remove".
# NOTE(review): assumes cells look like "['a', 'b']" -- confirm against the CSVs.
df['hashtags'] = df['hashtags'].apply(lambda x: x.strip('][').split(', ') if x != '[]' else 0)
# Raw strings keep `\w` as a regex class instead of an invalid string escape
# (a SyntaxWarning on recent Python versions).
reply_screen_name = re.compile(r"screen_name': '(\w+)'")
df['reply_to'] = df['reply_to'].apply(reply_screen_name.findall)

# Remove hashtags and reply_to from tweet
df['tweet'] = df.apply(lambda row: remove_terms(row['tweet'], row['hashtags'], '#'), axis=1)
df['tweet'] = df.apply(lambda row: remove_terms(row['tweet'], row['reply_to'], '@'), axis=1)

# Add a column about 'mention': every remaining "@name" in the tweet text.
# `screen_name` ends up bound to this mention pattern, as before.
screen_name = re.compile(r"@(\w+)")
df['mention'] = df['tweet'].apply(screen_name.findall)

# Remove mention from tweet
df['tweet'] = df.apply(lambda row: remove_terms(row['tweet'], row['mention'], '@'), axis=1)

In [65]:
# Summarise the frame: index range, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9150 entries, 0 to 2287
Data columns (total 29 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   date              9150 non-null   object
 1   tweet             9150 non-null   object
 2   label             9150 non-null   int64 
 3   hashtags          9150 non-null   object
 4   day               9150 non-null   int64 
 5   hour              9150 non-null   int64 
 6   urls              9150 non-null   object
 7   photos            9150 non-null   object
 8   video             9150 non-null   int64 
 9   retweet           9150 non-null   bool  
 10  nlikes            9150 non-null   int64 
 11  nreplies          9150 non-null   int64 
 12  nretweets         9150 non-null   int64 
 13  quote_url         730 non-null    object
 14  reply_to          9150 non-null   object
 15  bio               8400 non-null   object
 16  url               3342 non-null   object
 17  join_date     

# PY section starts

In [5]:
# Preview the text, link/media and engagement columns that the following cells process.
df[["tweet","urls","photos","video","retweet","nlikes","nreplies","nretweets","quote_url","url"]]

Unnamed: 0,tweet,urls,photos,video,retweet,nlikes,nreplies,nretweets,quote_url,url
0,"No suicidal shit. But, why the fuck I can't fi...",[],[],0,False,0,0,0,,
1,I'm a veteran who was medically discharged for...,[],[],0,False,5,0,0,https://twitter.com/Newsweek/status/1364580837...,
2,"For me, your love is suicidal ????",[],[],0,False,0,0,0,,
3,not suicidal or anything but i genuinely can't...,[],[],0,False,136,10,38,,
4,Depression really made niggas think been suici...,[],[],0,False,2,0,1,,https://t.co/1TuMZ5O1eE
...,...,...,...,...,...,...,...,...,...,...
9145,"Years ago, in no small part as a response to m...",[],[],0,False,135,1,5,,https://t.co/iJrYlsmfrC
9146,It can't be helped... In order to stop you tha...,[],[],0,False,0,0,0,,https://t.co/77gfCAsKWb
9147,Bruh no suicidal shit but I swear my funeral g...,[],[],0,False,1,0,0,,
9148,Suicide information for those who are struggli...,[],['https://pbs.twimg.com/media/EvGmTqgXUAgYIsB....,1,False,57,1,35,,https://t.co/FtQpoISr3p


In [6]:
# Count tweets per distinct `urls` value, most frequent first.
df.pivot_table(index="urls",values="tweet",aggfunc="count").sort_values(by=["tweet"],ascending = False)

Unnamed: 0_level_0,tweet
urls,Unnamed: 1_level_1
[],8618
['https://www.theguardian.com/environment/2021/feb/18/human-destruction-of-nature-is-senseless-and-suicidal-warns-un-chief'],10
['https://www.motherjones.com/politics/2021/02/human-destruction-of-nature-is-senseless-and-suicidal-says-un-chief/'],8
['https://twitter.com/richardking633/status/1361009024553152512?s=21'],5
['http://www.shiningalightonsuicide.org.uk'],5
...,...
['https://music.apple.com/us/album/lift-your-head-up-high-and-blow-your-brains-out/1440916408?i=1440916413'],1
['https://music.apple.com/us/album/i-saw-your-mommy/498742982?i=498743031'],1
['https://music.apple.com/br/album/teen-idle/612476580?i=612476590'],1
['https://mol.im/a/9290091'],1


In [7]:
# Collapse `urls` into a presence flag: "0" when the cell is the literal text
# "[]", "1" for anything else. The flags are strings, not integers.
df["urls_new"] = df["urls"].eq("[]").map({True: "0", False: "1"})

In [8]:
# Spot-check the new flag against the raw `urls` value for the first 100 rows.
df[["urls_new","urls"]].head(100)

Unnamed: 0,urls_new,urls
0,0,[]
1,0,[]
2,0,[]
3,0,[]
4,0,[]
...,...,...
95,1,['https://m.independent.ie/irish-news/irish-da...
96,0,[]
97,0,[]
98,0,[]


In [9]:
# photos will be classified as 0 or 1
# First count tweets per distinct `photos` value, most frequent first.
df.pivot_table(index="photos",values="tweet",aggfunc="count").sort_values(by=["tweet"],ascending = False)

Unnamed: 0_level_0,tweet
photos,Unnamed: 1_level_1
[],8580
['https://pbs.twimg.com/media/Eu0CYULVIAQVVL3.jpg'],2
['https://pbs.twimg.com/media/Eu_JUW3XIAQksmC.jpg'],2
['https://pbs.twimg.com/media/Eux_4jDXEAYVtdM.jpg'],1
['https://pbs.twimg.com/media/EvDFbvwXcAI9QYR.jpg'],1
...,...
['https://pbs.twimg.com/media/Eu8dG8dVoAUOO0Z.jpg'],1
['https://pbs.twimg.com/media/Eu8cCWDWYAYRAeP.jpg'],1
['https://pbs.twimg.com/media/Eu8_X6GXMAIHx5L.jpg'],1
['https://pbs.twimg.com/media/Eu8_QOAVEAI7hAJ.jpg'],1


In [10]:
# Collapse `photos` into a presence flag: "0" when the cell is the literal
# text "[]", "1" for anything else. The flags are strings, not integers.
df["photos_new"] = df["photos"].eq("[]").map({True: "0", False: "1"})

In [11]:
# Spot-check the new flag against the raw `photos` value for the first 100 rows.
df[["photos_new","photos"]].head(100)

Unnamed: 0,photos_new,photos
0,0,[]
1,0,[]
2,0,[]
3,0,[]
4,0,[]
...,...,...
95,0,[]
96,0,[]
97,0,[]
98,0,[]


In [12]:
# `video` only takes the values 0 and 1, so it needs no further processing.
df.video.unique()

array([0, 1], dtype=int64)

In [13]:
# `retweet` is always False in this data, so the column carries no information
# and is dropped further down.
df.retweet.unique()

array([False])

In [14]:
# Count tweets per distinct `nlikes` value, most frequent first.
df.pivot_table(index="nlikes",values="tweet",aggfunc="count").sort_values(by=["tweet"],ascending = False)

Unnamed: 0_level_0,tweet
nlikes,Unnamed: 1_level_1
0,4066
1,1916
2,925
3,452
4,296
...,...
129,1
123,1
121,1
120,1


In [15]:
# need to scale
from sklearn import preprocessing

In [16]:
# need to scale

In [17]:
# Count tweets per distinct `nreplies` value, most frequent first.
df.pivot_table(index="nreplies",values="tweet",aggfunc="count").sort_values(by=["tweet"],ascending = False)

Unnamed: 0_level_0,tweet
nreplies,Unnamed: 1_level_1
0,5224
1,2959
2,521
3,176
4,76
5,43
6,29
7,16
8,13
9,11


In [18]:
# Need to scale

In [19]:
# Count tweets per distinct `nretweets` value, most frequent first.
df.pivot_table(index="nretweets",values="tweet",aggfunc="count").sort_values(by=["tweet"],ascending = False)

Unnamed: 0_level_0,tweet
nretweets,Unnamed: 1_level_1
0,7768
1,633
2,202
3,124
4,75
...,...
72,1
75,1
78,1
81,1


In [20]:
from sklearn.preprocessing import MinMaxScaler

In [21]:
# Min-max scale the engagement counts (MinMaxScaler's default output range
# is [0, 1]).
scaler = preprocessing.MinMaxScaler()
columns_to_scale = ["nlikes","nreplies","nretweets"]
# Use the scaler created above (the original built a second, throwaway
# MinMaxScaler) and give the scaled frame df's own index.
new_df = pd.DataFrame(
    scaler.fit_transform(df[columns_to_scale]),
    columns=columns_to_scale,
    index=df.index,
)
for c in columns_to_scale:
    # Assign by position. With a fresh 0..n-1 index, label alignment would
    # write the wrong rows (or NaN) whenever df's index is not exactly 0..n-1,
    # e.g. after stacking several CSVs that each start at label 0.
    df[c] = new_df[c].to_numpy()

In [22]:
# Inspect the scaled engagement columns for the first 100 rows.
df[["nlikes","nreplies","nretweets"]].head(100)

Unnamed: 0,nlikes,nreplies,nretweets
0,0.000000,0.000000,0.000000
1,0.000055,0.000000,0.000000
2,0.000000,0.000000,0.000000
3,0.001501,0.014837,0.001903
4,0.000022,0.000000,0.000050
...,...,...,...
95,0.000188,0.005935,0.000250
96,0.000022,0.000000,0.000000
97,0.000011,0.000000,0.000000
98,0.000828,0.031157,0.000100


In [23]:
# quote_url: a value of 0 or a blank becomes 0; anything else becomes 1

In [24]:
# Count tweets per distinct `quote_url` value, most frequent first.
df.pivot_table(index="quote_url",values="tweet",aggfunc="count").sort_values(by=["tweet"],ascending = False)

Unnamed: 0_level_0,tweet
quote_url,Unnamed: 1_level_1
https://twitter.com/kmintuition/status/1363531156319318025,100
https://twitter.com/Mature_Gambino/status/1363178193121906689,35
0,8
https://twitter.com/ThaboNdawonde/status/1364577465969041409,5
https://twitter.com/soursulley/status/1363369857165578244,4
...,...
https://twitter.com/Pays2Listen/status/1363317788165758977,1
https://twitter.com/PamelaBrownCNN/status/1363652534993358850,1
https://twitter.com/PTSDGuts2/status/1363208879845244931,1
https://twitter.com/POTUSJB4CTBT/status/1363308152159170561,1


In [25]:
# Cast to str so that missing values become the literal text "nan".
df["quote_url"] = df["quote_url"].astype(str)

In [26]:
# Inspect the column after the cast; missing entries now read "nan".
df["quote_url"] 

0                                                     nan
1       https://twitter.com/Newsweek/status/1364580837...
2                                                     nan
3                                                     nan
4                                                     nan
                              ...                        
9145                                                  nan
9146                                                  nan
9147                                                  nan
9148                                                  nan
9149                                                  nan
Name: quote_url, Length: 9150, dtype: object

In [27]:
# Presence flag for a quoted tweet: "0" when the cell is the text "nan" or
# "0", "1" for anything else. The flags are strings, not integers.
df["quote_url_new"] = df["quote_url"].isin(["nan", "0"]).map({True: "0", False: "1"})

In [28]:
# Spot-check the new flag against `quote_url` for the first 200 rows.
df[["quote_url_new","quote_url"]].head(200)

Unnamed: 0,quote_url_new,quote_url
0,0,
1,1,https://twitter.com/Newsweek/status/1364580837...
2,0,
3,0,
4,0,
...,...,...
195,0,
196,0,
197,0,
198,0,


In [29]:
# url: a blank becomes 0; anything else becomes 1

In [30]:
# Inspect the raw `url` column; missing entries show as NaN.
df["url"] 

0                           NaN
1                           NaN
2                           NaN
3                           NaN
4       https://t.co/1TuMZ5O1eE
                 ...           
9145    https://t.co/iJrYlsmfrC
9146    https://t.co/77gfCAsKWb
9147                        NaN
9148    https://t.co/FtQpoISr3p
9149                        NaN
Name: url, Length: 9150, dtype: object

In [31]:
# Cast to str so that missing values become the literal text "nan".
df["url"] = df["url"].astype(str)

In [32]:
# Inspect the column after the cast; missing entries now read "nan".
df["url"]

0                           nan
1                           nan
2                           nan
3                           nan
4       https://t.co/1TuMZ5O1eE
                 ...           
9145    https://t.co/iJrYlsmfrC
9146    https://t.co/77gfCAsKWb
9147                        nan
9148    https://t.co/FtQpoISr3p
9149                        nan
Name: url, Length: 9150, dtype: object

In [33]:
# Count tweets per distinct `url` value, most frequent first.
df.pivot_table(index="url",values="tweet",aggfunc="count").sort_values(by=["tweet"],ascending = False)

Unnamed: 0_level_0,tweet
url,Unnamed: 1_level_1
,5808
https://t.co/WCRfcb3mBb,26
https://t.co/cXnat2KfIq,20
https://t.co/ekaT92EzTE,12
https://t.co/0F6431eC5t,11
...,...
https://t.co/KOAcC3gD4j,1
https://t.co/KRMj0oPO6r,1
https://t.co/KSmDT4Xy2N,1
https://t.co/KUWhiSorou,1


In [34]:
# Presence flag for `url`: "0" when the cell is the text "nan", "1" for
# anything else. The flags are strings, not integers.
df["url_new"] = df["url"].ne("nan").map({True: "1", False: "0"})

In [35]:
# Spot-check the new flag against `url` for the first 200 rows.
df[["url","url_new"]].head(200)

Unnamed: 0,url,url_new
0,,0
1,,0
2,,0
3,,0
4,https://t.co/1TuMZ5O1eE,1
...,...,...
195,,0
196,,0
197,,0
198,,0


In [36]:
# Overwrite each raw column with its "0"/"1" flag counterpart (`<name>_new`).
for flag_target in ("urls", "photos", "quote_url", "url"):
    df[flag_target] = df[flag_target + "_new"]

In [37]:
# Remove the temporary `*_new` helper columns together with `retweet`,
# modifying the frame in place.
df.drop(
    columns=["urls_new", "url_new", "photos_new", "retweet", "quote_url_new"],
    inplace=True,
)

In [38]:
# Preview the processed columns: flags in urls/photos/quote_url/url and scaled engagement counts.
df[["tweet","urls","photos","video","nlikes","nreplies","nretweets","quote_url","url"]]

Unnamed: 0,tweet,urls,photos,video,nlikes,nreplies,nretweets,quote_url,url
0,"No suicidal shit. But, why the fuck I can't fi...",0,0,0,0.000000,0.000000,0.000000,0,0
1,I'm a veteran who was medically discharged for...,0,0,0,0.000055,0.000000,0.000000,1,0
2,"For me, your love is suicidal ????",0,0,0,0.000000,0.000000,0.000000,0,0
3,not suicidal or anything but i genuinely can't...,0,0,0,0.001501,0.014837,0.001903,0,0
4,Depression really made niggas think been suici...,0,0,0,0.000022,0.000000,0.000050,0,1
...,...,...,...,...,...,...,...,...,...
9145,"Years ago, in no small part as a response to m...",0,0,0,0.001490,0.001484,0.000250,0,1
9146,It can't be helped... In order to stop you tha...,0,0,0,0.000000,0.000000,0.000000,0,1
9147,Bruh no suicidal shit but I swear my funeral g...,0,0,0,0.000011,0.000000,0.000000,0,0
9148,Suicide information for those who are struggli...,0,1,1,0.000629,0.001484,0.001752,0,1


# PY section ends