# Preprocessing

This notebook is about visualizing, cleaning and preprocessing the dataset. 

In [3]:
import pandas as pd
from os import path
import glob

import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import PlaintextCorpusReader

#can be removed
from jupyterthemes import jtplot
jtplot.style()

## 1. Load raw data

### 1.1 Read data from csv files and save it as 1 file

In [8]:
if not path.exists("../data/raw_data.csv"):
    data_path = "../data/original-dataset"
    all_files = glob.glob(data_path + "/*.csv")
    li = []
    for filename in all_files:
        df = pd.read_csv(filename, index_col=None, header=0)
        # only keep x rows per file
        df = df[0:3000]
        
        li.append(df)

    frame = pd.concat(li, axis=0, ignore_index=True)
    frame.to_csv("../data/raw_data.csv", index=False)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  if sys.path[0] == '':


### 1.2 Read from data/raw_data

In [9]:
df = pd.read_csv("../data/raw_data.csv")

#rename columns
df = df.rename(columns={"0": "text", "2": "subreddit", "3":"label"})

df

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0.1,text,1,10,11,subreddit,label,4,5,6,7,8,9,Unnamed: 0
0,,466jwy,1.0,,startrek,television,1455683402.0,JesseBotwin,1.0,0.0,2257.0,937.0,0
1,that sounds awesome i ll check it out tonight ...,d027xk0,0.0,,startrek,television,1455650159.0,notheebie,1.0,0.0,307.0,26352.0,1
2,star trek tos 50th anniversary tribute game n...,463gry,0.0,,startrek,television,1455643118.0,thisoldfart,3.0,0.0,3811.0,878.0,2
3,would nt spock have found it a bit insulting ...,461kw5,0.0,,startrek,television,1455611558.0,FPSD,18.0,0.0,1.0,1486.0,3
4,also what is a soul,d01pqtn,0.0,,startrek,television,1455611836.0,Realik,2.0,0.0,16429.0,14192.0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
149995,tastes like tobacco,d01kpyh,0.0,,tf2,gaming,1.4556e+09,Sentient545,5,0.0,1168.0,21241.0,2995
149996,http iimgurcombc26ygif,d01bp1k,0.0,,tf2,gaming,1.45558e+09,Rhymes_with_ike,13,0.0,11134.0,26374.0,2996
149997,https wikiteamfortresscomwimagesff2spy_taunts...,d01mbjk,0.0,,tf2,gaming,1.4556e+09,KourageousBagel,3,0.0,276.0,622.0,2997
149998,it s an oldie but it s a hell of a goodie,d01puwj,0.0,,tf2,gaming,1.45561e+09,--Benson,3,0.0,1.0,3576.0,2998


## 2. Basic cleaning

In [10]:
#only keep important columns
df = df[["text", "subreddit" ,"label"]]

# remove empty texts
df = df[(df["text"] != " removed ") & (df["text"] != " deleted ") & (df["text"] != np.nan)]

# remove NAN values
df = df.dropna()

# only keep at least 10 words per item
df = df[df["text"].str.count(' ') > 10]

# drop duplicates
df = df.drop_duplicates(subset="text")
df

Unnamed: 0,text,subreddit,label
1,that sounds awesome i ll check it out tonight ...,startrek,television
2,star trek tos 50th anniversary tribute game n...,startrek,television
3,would nt spock have found it a bit insulting ...,startrek,television
5,spock was half human and had worked for some t...,startrek,television
6,i got a notice from audible this afternoon tha...,startrek,television
...,...,...,...
149009,regular model https wikiteamfortresscomwim...,tf2,gaming
149010,my friend gave me a strange killstreak version...,tf2,gaming
149011,so i m pretty sad to know that the bottle does...,tf2,gaming
149013,fun fact if you die with the neon annihilator...,tf2,gaming


### 2.a Save cleaned data in csv file

In [11]:
df.to_csv("../data/cleaned_data.csv", index=False)