In [1]:
import re
import pandas as pd
import spacy
import numpy as np

In [2]:
# Load the English pipeline by its package name: the 'en' shortcut link was
# removed in spaCy v3, while the package name also works on v2 installs.
# NER and the dependency parser are disabled because only lemmas and
# stop-word flags are read downstream.
nlp = spacy.load('en_core_web_sm', disable=["ner", "parser"])

In [3]:
def cleaning(doc, min_tokens=3):
    """
    Lemmatize a processed document and drop its stop words.

    :param doc: spacy Doc object processed by the pipeline (any iterable of
        tokens exposing ``lemma_`` and ``is_stop`` is accepted)
    :param min_tokens: minimum number of remaining tokens a document needs in
        order to be kept; the default of 3 matches the original ``> 2`` rule
    :return: the remaining lemmas joined by single spaces, or None when fewer
        than ``min_tokens`` tokens are left
    """
    lemmas = [token.lemma_ for token in doc if not token.is_stop]

    # Very short documents bring little benefit to training, so they are dropped.
    if len(lemmas) < min_tokens:
        return None
    return ' '.join(lemmas)

In [4]:
# Load the MensRightsLaw comments and label every row with its subreddit.
df = (
    pd.read_csv('./../data/reddit/cm/MensRightsLaw_comments.csv')
    .assign(subreddit='MensRightsLaw')
)

In [5]:
# Load the MRActivism comments and label every row with its subreddit.
df2 = (
    pd.read_csv('./../data/reddit/cm/MRActivism_comments.csv')
    .assign(subreddit='MRActivism')
)

In [6]:
# Stack both subreddit frames. DataFrame.append was removed in pandas 2.0;
# pd.concat is the supported equivalent and, without ignore_index, keeps the
# original row labels just as append did.
df = pd.concat([df, df2])

In [7]:
# Add one placeholder row whose fields are all 1 / '1'.
# DataFrame.append was removed in pandas 2.0, so the row is wrapped in a
# one-row frame and concatenated; ignore_index=True renumbers rows 0..n-1.
df = pd.concat(
    [
        df,
        pd.DataFrame([{'ID':'1', 'Comment':'1', 'Author':'1', 'Score':1, 'Parent id':'1', 'Publish Date':'1', 'subreddit':'1'}]),
    ],
    ignore_index=True,
)

In [8]:
# Sanity check: (rows, columns) of the combined comments frame.
df.shape

(707, 7)

In [None]:
# Lazily normalise each comment: every run of characters other than letters
# and apostrophes collapses to a single space, then the text is lower-cased.
# This is a generator, so it can be consumed only once.
brief_cleaning = (
    re.sub("[^A-Za-z']+", ' ', str(comment)).lower()
    for comment in df['Comment']
)

In [None]:
 # Run the spaCy pipeline over the normalised comments in batches and clean each Doc.
 # n_threads is no longer passed: spaCy deprecated it in v2.1 and removed it in v3.
 txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=32)]

In [None]:
 # Wrap the cleaned texts in a single-column frame; entries that cleaning()
 # rejected are None and show up as missing values in the 'clean' column.
 df_clean = pd.DataFrame({'clean': txt})

In [None]:
# Inspect the last rows of the cleaned text.
df_clean.tail()

Unnamed: 0,clean
702,te talk thumb nail like hand exactly like
703,live sacramento life super farfetched lol good...
704,number sentence think read
705,exact thing toronto year
706,


In [None]:
# Store each row's position (0..n-1) in an explicit 'index' column.
df_clean['index'] = np.arange(len(df_clean))

In [None]:
# Add one extra row ('b', index 709) to the cleaned frame.
# DataFrame.append was removed in pandas 2.0, so the row is wrapped in a
# one-row frame and concatenated; ignore_index=True renumbers the row labels.
df_clean = pd.concat(
    [df_clean, pd.DataFrame([{'clean':'b', 'index':709}])],
    ignore_index=True,
)

In [None]:
# Keep only the 'clean' column (df_clean becomes a Series from here on),
# then drop missing entries and repeated texts. Row labels are preserved.
df_clean = (
    df_clean['clean']
    .dropna()
    .drop_duplicates()
)

In [None]:
# Inspect the tail again after dropping missing and duplicate entries.
df_clean.tail()

702            te talk thumb nail like hand exactly like
703    live sacramento life super farfetched lol good...
704                           number sentence think read
705                             exact thing toronto year
707                                                    b
Name: clean, dtype: object

### Completing the data frame (adding subreddit and date columns)

In [None]:
import pickle
import pandas as pd
import numpy as np

In [None]:
# Despite its .csv extension, df_clean.csv is read as a pickled object,
# hence the binary mode. Only unpickle files from a trusted source.
with open('df_clean.csv', 'rb') as pickled_file:
    df_clean = pickle.load(pickled_file)

In [None]:
# Build one combined comments frame from the per-subreddit CSV exports
# listed in subreddits.csv.
sdf = pd.read_csv('./../subreddits.csv')
subreddits = sdf.values.tolist()

# Collect the frames and concatenate once. DataFrame.append was removed in
# pandas 2.0 and re-copied the accumulated frame on every iteration; the old
# "is this the first row?" comparison also reset the result whenever the
# first entry appeared again later in the list.
frames = []
for s in subreddits:
    # Each row is a list; its string form minus the first and last five
    # characters is used as the subreddit name.
    # NOTE(review): these offsets depend on the layout of subreddits.csv — confirm.
    sub = str(s)[5:-5]

    temp_df = pd.read_csv(f'./../data/reddit/cm/{sub}_comments.csv')
    temp_df['subreddit'] = sub
    frames.append(temp_df)
    print(f'added {sub}')

# ignore_index=True gives the combined frame a fresh 0..n-1 index.
# pd.concat raises ValueError if subreddits.csv listed no subreddits.
df = pd.concat(frames, ignore_index=True)

In [None]:
# Inspect the first rows of the combined comments frame.
df.head()

In [None]:
# Remove missing entries and exact duplicates from the loaded data;
# the surviving rows keep their original labels.
df_clean = (
    df_clean
    .dropna()
    .drop_duplicates()
)

In [None]:
# Attach each comment's subreddit label from the combined frame `df`.
# NOTE(review): pandas matches the values by index label, so this assumes
# df_clean is a DataFrame whose index still lines up with df's rows — confirm.
df_clean['subreddit'] = df['subreddit']

In [None]:
# Attach each comment's publish date from the combined frame `df`.
# NOTE(review): values are matched by index label, so this assumes df_clean's
# index still lines up with df's rows — confirm.
df_clean['Date'] = df['Publish Date']

In [None]:
# Inspect the last rows after adding the subreddit and date columns.
df_clean.tail()

In [None]:
# Persist the enriched data. The file is written in pickle format even
# though its name ends in .csv, so it must be read back with a pickle reader.
df_clean.to_pickle("df_clean_sub_date.csv")