## Loader les données

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import PlaintextCorpusReader
from os import path
import glob

In [2]:
if not path.exists("raw_data.csv"):
    data_path = "reddit-dataset/data"
    all_files = glob.glob(data_path + "/*.csv")
    li = []
    for filename in all_files:
        df = pd.read_csv(filename, index_col=None, header=0)
        # only keep 200 rows per file
        df = df[0:200]
        li.append(df)

    frame = pd.concat(li, axis=0, ignore_index=True)
    frame.to_csv("raw_data.csv", index=False)


In [3]:
df = pd.read_csv("raw_data.csv")

In [4]:
df

Unnamed: 0.1,0,1,10,11,2,3,4,5,6,7,8,9,Unnamed: 0
0,my neighbor has moss on her clitoris,d02ojhe,1.0,,funny,humor,1455673908.0,FLGulf,1.0,0.0,64.0,144312.0,0
1,lettuce not make to many jokes,d02iumx,0.0,,funny,humor,1455665090.0,Looksatducks,1.0,0.0,1.0,48.0,1
2,christopher columbus also sold children as sex...,d02kuqg,0.0,,funny,humor,1455668295.0,Malkalack,2.0,0.0,1478.0,19270.0,2
3,rthathappened,d02uf4u,1.0,,funny,humor,1455683493.0,WadeWilsonforPope,-3.0,0.0,675.0,286907.0,3
4,gt 734 nothing good on huh,d02p7to,0.0,,funny,humor,1455674936.0,JoeyFatts,1.0,0.0,21259.0,23074.0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,do daily palpitations with no known cause rais...,d00p2v6,0.0,,science,learning,1455549959.0,JimJamJamboree,6.0,0.0,1.0,59.0,195
9996,what are a few diet and lifestyle changes one ...,d00lxua,0.0,,science,learning,1455543369.0,vasdak,4.0,0.0,1.0,512.0,196
9997,,465une,0.0,,science,learning,1455673666.0,ninthinning01,2.0,0.0,15156.0,2010.0,197
9998,gt we know from satellite observations that ...,d02oej8,0.0,,science,learning,1455673701.0,ninthinning01,0.0,0.0,15156.0,2010.0,198


In [5]:
df = df.rename(columns={"0": "text", "2": "subreddit", "3":"label"})
df

Unnamed: 0.1,text,1,10,11,subreddit,label,4,5,6,7,8,9,Unnamed: 0
0,my neighbor has moss on her clitoris,d02ojhe,1.0,,funny,humor,1455673908.0,FLGulf,1.0,0.0,64.0,144312.0,0
1,lettuce not make to many jokes,d02iumx,0.0,,funny,humor,1455665090.0,Looksatducks,1.0,0.0,1.0,48.0,1
2,christopher columbus also sold children as sex...,d02kuqg,0.0,,funny,humor,1455668295.0,Malkalack,2.0,0.0,1478.0,19270.0,2
3,rthathappened,d02uf4u,1.0,,funny,humor,1455683493.0,WadeWilsonforPope,-3.0,0.0,675.0,286907.0,3
4,gt 734 nothing good on huh,d02p7to,0.0,,funny,humor,1455674936.0,JoeyFatts,1.0,0.0,21259.0,23074.0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,do daily palpitations with no known cause rais...,d00p2v6,0.0,,science,learning,1455549959.0,JimJamJamboree,6.0,0.0,1.0,59.0,195
9996,what are a few diet and lifestyle changes one ...,d00lxua,0.0,,science,learning,1455543369.0,vasdak,4.0,0.0,1.0,512.0,196
9997,,465une,0.0,,science,learning,1455673666.0,ninthinning01,2.0,0.0,15156.0,2010.0,197
9998,gt we know from satellite observations that ...,d02oej8,0.0,,science,learning,1455673701.0,ninthinning01,0.0,0.0,15156.0,2010.0,198


In [6]:
df = df[["text", "subreddit" ,"label"]]
df

Unnamed: 0,text,subreddit,label
0,my neighbor has moss on her clitoris,funny,humor
1,lettuce not make to many jokes,funny,humor
2,christopher columbus also sold children as sex...,funny,humor
3,rthathappened,funny,humor
4,gt 734 nothing good on huh,funny,humor
...,...,...,...
9995,do daily palpitations with no known cause rais...,science,learning
9996,what are a few diet and lifestyle changes one ...,science,learning
9997,,science,learning
9998,gt we know from satellite observations that ...,science,learning


In [7]:
df = df[(df["text"] != " removed ") & (df["text"] != " deleted ") & (df["text"] != np.nan)]
df

Unnamed: 0,text,subreddit,label
0,my neighbor has moss on her clitoris,funny,humor
1,lettuce not make to many jokes,funny,humor
2,christopher columbus also sold children as sex...,funny,humor
3,rthathappened,funny,humor
4,gt 734 nothing good on huh,funny,humor
...,...,...,...
9995,do daily palpitations with no known cause rais...,science,learning
9996,what are a few diet and lifestyle changes one ...,science,learning
9997,,science,learning
9998,gt we know from satellite observations that ...,science,learning


In [8]:
df =df.dropna()
df

Unnamed: 0,text,subreddit,label
0,my neighbor has moss on her clitoris,funny,humor
1,lettuce not make to many jokes,funny,humor
2,christopher columbus also sold children as sex...,funny,humor
3,rthathappened,funny,humor
4,gt 734 nothing good on huh,funny,humor
...,...,...,...
9993,most american schools will suspend not only th...,science,learning
9995,do daily palpitations with no known cause rais...,science,learning
9996,what are a few diet and lifestyle changes one ...,science,learning
9998,gt we know from satellite observations that ...,science,learning


In [None]:
pd.get_dummies(df["subreddit"],prefix=['country'])

## Même approche que le tp2

In [9]:
from nltk.stem.snowball import SnowballStemmer
import string
from nltk.corpus import stopwords

In [10]:
class Preprocessor:
    def __init__(self, stem=True, punctuation=True, digit=True):
        self.st = SnowballStemmer("french", ignore_stopwords=True)
        self.punctuations = set(string.punctuation)
        self.stem = stem
        self.punctuation = punctuation
        self.digit = digit
    
    def raw_preprocess(self, token_list):
        new_token_list = []
        for token in token_list:
            is_token_valid = True
            token = token.lower()
            
            # stemming
            if self.stem:
                token = self.st.stem(token)

            # punctuation filtering
            if self.punctuation and token in self.punctuations:
                is_token_valid = False

            # digit filtering
            elif self.digit and token.isdigit():
                is_token_valid = False

            # add token to new sub list
            if is_token_valid:
                new_token_list.append(token)
        return new_token_list
        
    def preprocess_tokens(self, tokens):
        new_tokens = []
        for token_list in tokens:
            new_tokens.append(self.raw_preprocess(token_list))
        return new_tokens
    
def preprocess(tokens):
    preprocessor = Preprocessor()
    return preprocessor.preprocess_tokens(test)

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

def identity_tokenizer(text):
    return text

preprocessor = Preprocessor()
vectorizer = TfidfVectorizer(preprocessor=preprocessor.raw_preprocess ,tokenizer=identity_tokenizer)
#vectorizer = TfidfVectorizer(preprocessor=preprocessor.raw_preprocess)
IDF_data = vectorizer.fit_transform(df["text"].values)

In [15]:
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=26, algorithm='randomized', n_iter=100)
svd.fit(IDF_data)

TruncatedSVD(algorithm='randomized', n_components=26, n_iter=100,
             random_state=None, tol=0.0)

In [16]:
sum(svd.explained_variance_)

0.2144968389900094

In [17]:
SVD_data = svd.transform(IDF_data)