# Setup

## Import Libraries

In [1]:
import sqlite3
from pathlib import Path

import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

## Convert db file to pandas DataFrame

In [2]:
# Location of the scraped Discord messages (SQLite file), relative to the notebook.
DB_PATH = Path('./discord scraper/Discord Scrapes/text.db')

# sqlite3.connect() silently creates a new, empty database when the file is
# missing, which would only surface later as a confusing error when the table
# list comes back empty. Fail here with the resolved path instead.
if not DB_PATH.is_file():
    raise FileNotFoundError(f"SQLite database not found: {DB_PATH.resolve()}")

con = sqlite3.connect(str(DB_PATH))

In [3]:
# Look up the tables stored in the database and use the first one listed.
# NOTE(review): presumably the scraper writes a single table - confirm.
cursor = con.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
table_names = [row[0] for row in cursor.fetchall()]

# An empty list means the database holds no tables; raise a readable error
# rather than an IndexError from indexing an empty result.
if not table_names:
    raise ValueError("The SQLite database contains no tables - check the path passed to sqlite3.connect().")

table_name = table_names[0]

In [4]:
# Table names cannot be bound as SQL parameters, so quote the identifier
# (doubling any embedded double quotes) to keep the query valid for names
# containing spaces, keywords or quote characters.
quoted_table = '"' + table_name.replace('"', '""') + '"'

# Load author name, message text and timestamp of every row into a DataFrame.
db = pd.read_sql_query(f"SELECT name, content, timestamp FROM {quoted_table}", con)

In [5]:
# Preview the loaded messages (columns: name, content, timestamp).
db

Unnamed: 0,name,content,timestamp
0,Spookies#3464,cosmetic items like the carrionette wig should...,2020-11-12T11:48:58.230000+00:00
1,basstomouth#8141,ne one wanna play some pubg? steam is basstomo...,2020-11-12T11:49:27.356000+00:00
2,SteveTheHappy🐳#0001,> is there any thread where we can report play...,2020-11-12T12:12:19.966000+00:00
3,762_k4tyusha#4435,"i mean, i reported all three of them but still...",2020-11-12T12:36:37.224000+00:00
4,762_k4tyusha#4435,"yall need to step up your AC game, since there...",2020-11-12T12:37:19.870000+00:00
...,...,...,...
7922,Deleted User#0000,best pubg rap hands down https://www.youtube.c...,2019-07-26T01:49:18.237000+00:00
7923,pinekel#9847,Fix the damn game before you add more shit,2019-07-26T01:54:18.494000+00:00
7924,Mary Ellen Katz#2904,"Dear pubg. When I'm trying to prone, but the a...",2019-07-26T01:59:31.709000+00:00
7925,Sylar#7230,anything we can do was working fine till 5 min...,2019-07-25T16:10:40.720000+00:00


In [6]:
# Persist the loaded messages as CSV.
# NOTE(review): the DataFrame index is written as an unnamed first column;
# consider index=False if nothing that reads this file relies on it.
OUTPUT_CSV = '../data/scraped data/pubg_discord_2.csv'
db.to_csv(OUTPUT_CSV)

# Data Preprocessing

## Bag of Words

In [6]:
# Bag of words: one row per message, one column per vocabulary term, with the
# raw term count as the value. English stop words (NLTK list) are dropped.
count_vectorizer = CountVectorizer(analyzer='word', stop_words=stopwords.words('english'))
bag_matrix = count_vectorizer.fit_transform(db['content'])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(count_vectorizer, 'get_feature_names_out'):
    bag_terms = count_vectorizer.get_feature_names_out()
else:
    bag_terms = count_vectorizer.get_feature_names()

# Densify so the counts can be inspected as a regular DataFrame.
X_bag = pd.DataFrame(bag_matrix.toarray(), columns=bag_terms)

In [7]:
# (rows, columns) = (messages, vocabulary terms).
# NOTE(review): the recorded output shows 2071 rows while the `db` preview
# shows row indices up to 7925 - outputs look stale; re-run from a fresh kernel.
X_bag.shape

(2071, 3577)

In [8]:
# Vocabulary size, i.e. the number of term columns in the bag-of-words frame.
X_bag.shape[1]

3577

In [38]:
# Preview the bag-of-words count matrix (one column per vocabulary term).
X_bag

Unnamed: 0,00,000,01,02,022dcb46bceba00cc953aae4eaa13df7,03,04,06,07,09,...,zipline,zowie,zvfyr4n,zxvytel7ow4,αδερφε,αν,κανα,στειλε,φες,ⰴⱃⰴⰰ
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2066,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2067,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2068,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2069,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Tf-idf

In [21]:
# TF-IDF: same tokenisation and stop-word list as the bag-of-words cell, but
# each term is weighted by its TF-IDF score instead of its raw count.
tfidf_vectorizer = TfidfVectorizer(analyzer='word', stop_words=stopwords.words('english'))
tfidf_matrix = tfidf_vectorizer.fit_transform(db['content'])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    tfidf_terms = tfidf_vectorizer.get_feature_names_out()
else:
    tfidf_terms = tfidf_vectorizer.get_feature_names()

# Densify so the weights can be inspected as a regular DataFrame.
X_tfidf = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_terms)

In [22]:
# (rows, columns) = (messages, vocabulary terms).
X_tfidf.shape

(2071, 3577)

In [23]:
# Show the first five rows of the TF-IDF weight matrix.
X_tfidf.head()

Unnamed: 0,00,000,01,02,022dcb46bceba00cc953aae4eaa13df7,03,04,06,07,09,...,zipline,zowie,zvfyr4n,zxvytel7ow4,αδερφε,αν,κανα,στειλε,φες,ⰴⱃⰴⰰ
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
