In [None]:
#!pip install fasttext



In [7]:
import pandas as pd
import fasttext
import numpy as np
from sklearn.decomposition import PCA
import plotly.express as px
from sklearn.model_selection import train_test_split
import re

In [8]:
data = pd.read_csv('https://raw.githubusercontent.com/GregCollab/DataBDA/main/FirstSet.csv')

In [9]:
data.head(3)

Unnamed: 0.1,Unnamed: 0,Datetime,Usernames,Channels,Messages
0,0,2022-04-26T14:24:31.592519,forever_flat,thebausffs,b'Nerdge'
1,1,2022-04-26T14:24:31.592580,uperplexed,thebausffs,b'SUSSY'
2,2,2022-04-26T14:24:31.750518,johnbellotv,thebausffs,b'REAL VOICE OMEGALUL'


In [10]:
#We start by preprocessing the messages. This should be more refined in the future.
messages = data['Messages'].str.replace('b\'','') #Due to encoding, each message has a b' infront of it. We remove in this step.
labels = data['Channels'] 
ripper = lambda x: x[:-1] #Remove the ' at the end of the message as well
messages = messages.apply(ripper) 
 #This changes the message <-> channel relation to a word <-> channel relation. This is quite heavy.
splitter = lambda x: x.split() #This splits the message into the words
messages = messages.apply(splitter)
lens = [len(x) for x in messages]
Labeling = lens*(labels + " ").apply(splitter)
messages = [word for sentence in messages for word in sentence] #This splits the message into the words
Labeling = [name for all in Labeling for name in all]
stripper = lambda x: re.sub(r'[^\w]', '', x) #This removes everything that is not a number or a letter (both upper and lower case)
messages = pd.Series(messages).apply(stripper)
messages = messages.str.lower() #This puts everything to on the same case


In [11]:
print(list(zip(Labeling[0:5], messages[0:5])))

[('thebausffs', 'nerdge'), ('thebausffs', 'sussy'), ('thebausffs', 'real'), ('thebausffs', 'voice'), ('thebausffs', 'omegalul')]


In [12]:
message = list(messages.values)

In [13]:
message[0:5]

['nerdge', 'sussy', 'real', 'voice', 'omegalul']

In [14]:
with open("messages.txt", "w") as output:
    output.write(str(message))
    

In [15]:
import csv
Label = ['__label__'+x for x in  Labeling]
Labeled = pd.DataFrame({'Label':Label, 'Message':message})
Labeled_train, Labeled_test = train_test_split(Labeled, test_size=0.2)
Labeled_train.to_csv('Train.txt',                                          index = False, 
                                          sep = ' ',
                                          header = None, 
                                          quoting = csv.QUOTE_NONE, 
                                          quotechar = "", 
                                          escapechar = " ")
Labeled_test.to_csv('Test.txt',                                          index = False, 
                                          sep = ' ',
                                          header = None, 
                                          quoting = csv.QUOTE_NONE, 
                                          quotechar = "", 
                                          escapechar = " ")
    
    

In [16]:
 model = fasttext.train_supervised(input = 'Train.txt')

In [17]:
print( 'Number of obs, Precision, Recall')
model.test('Test.txt')


Number of obs, Precision, Recall


(38038, 0.7562700457437299, 0.7562700457437299)

In [18]:
model.predict('please')[1]

array([0.5865131])

In [19]:
model.predict('cs' )

(('__label__thebausffs',), array([0.99881029]))

In [20]:
model.save_model('ft.bin')

In [21]:
pca = PCA(n_components = 2)

In [22]:
model.get_word_vector('win')

array([-0.12028995,  0.09190637, -0.12189238, -0.009699  ,  0.09628513,
        0.08207635,  0.05665962,  0.01517751,  0.16794685,  0.11697018,
       -0.02156832,  0.05943175,  0.13271154,  0.09491484, -0.1059056 ,
       -0.01735095,  0.06030405,  0.08879631, -0.0534592 , -0.09918249,
       -0.06310323,  0.10910721,  0.05611694, -0.09240171,  0.04895224,
       -0.08923727, -0.0075494 , -0.06288004, -0.01994757,  0.07632409,
       -0.0704386 , -0.02300028, -0.0230711 , -0.09960596, -0.12249687,
       -0.01267282,  0.03379687, -0.071408  , -0.04139612, -0.11407963,
        0.12553278,  0.09309272,  0.03741778,  0.03954829, -0.03670964,
        0.01933279, -0.05607216, -0.03661919, -0.10884884, -0.00755672,
       -0.08345405,  0.09234697, -0.14408766,  0.09624885, -0.05434259,
       -0.09245604,  0.00145351,  0.04568502,  0.02852452,  0.0733336 ,
       -0.04699288,  0.05123389,  0.0934419 ,  0.08973122, -0.078373  ,
       -0.00051051, -0.06920914,  0.03623291,  0.09579483,  0.03

In [23]:
names = []
values = []

In [24]:
for item in model.words:
    names.append(item)
    values.append(model[item])
    

In [25]:
PrinComp = pca.fit_transform(values)

In [138]:
Examples = messages
H = pd.Series(Labeling).iloc[list(Examples.index)]
Vectorizer = lambda x: model.get_word_vector(x)
Res = Examples.apply(Vectorizer)
Comps = pca.transform(list(Res))
Frame = pd.DataFrame(
    {'Names': Examples, 'First Component': Comps[:, 0], 'Second Component': Comps[:, 1], "Channel Name":H})

In [39]:
fig = px.scatter(Frame, 'First Component', 'Second Component', color='Channel Name', hover_name='Names')
fig.update_layout(title_text = 'First two Principal Components', title_x=0.5, showlegend=True)

In [40]:
model.get_nearest_neighbors('sion', k=10)

[(0.9997475743293762, 'intining'),
 (0.9997473359107971, 'deathds'),
 (0.9997467994689941, 'lailai'),
 (0.9997437000274658, 'ebat'),
 (0.9997364282608032, 'electricute'),
 (0.9997360110282898, 'boomb'),
 (0.9997269511222839, 'igbtot'),
 (0.999725341796875, 'bcoach'),
 (0.9997232556343079, 'medikalmonday'),
 (0.9997227787971497, 'okkkkkkkkk')]

In [139]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


In [140]:
bauss = Examples[Comps[:, 0] > 1]
add = Examples[Comps[:, 0] < -1]

In [141]:
sid_obj = SentimentIntensityAnalyzer()


In [142]:
intl = []
for item in add:
    intl.append(sid_obj.polarity_scores(item)['neg'])
 

In [143]:
px.histogram(intl)

In [144]:
int2 = []
for item in bauss:
    int2.append(sid_obj.polarity_scores(item)['neg'])
 

In [145]:
np.mean(int2)

0.09316248293246508

In [146]:
np.mean(intl)

0.03415300546448088

In [100]:

sid_obj.polarity_scores('despair')


{'neg': 1.0, 'neu': 0.0, 'pos': 0.0, 'compound': -0.3182}

In [137]:
bauss.sample(5)

10812                 krugsup
110930                 pagman
2543                     lolw
154217             thebausffs
32907     forsenlaughingatyou
dtype: object