In [73]:
import re
import pandas as pd
import spacy
import numpy as np

In [20]:
# Load spaCy's English pipeline with the NER and parser components disabled.
# NOTE(review): the 'en' shortcut name is believed to resolve only on spaCy 2.x —
# confirm the installed version before upgrading.
nlp = spacy.load('en', disable=["ner", "parser"])

In [149]:
def cleaning(doc, min_tokens=2):
    """
    Lemmatize a processed document and remove its stop words.

    :param doc: spacy Doc object processed by the pipeline (any iterable of
        tokens exposing ``lemma_`` and ``is_stop`` is accepted)
    :param min_tokens: the document is kept only when MORE than this many
        tokens remain after stop-word removal; defaults to 2
    :return: the remaining lemmas joined by single spaces, or None when the
        document is too short
    """
    txt = [token.lemma_ for token in doc if not token.is_stop]

    # Very short documents contribute little to training, so they are reported
    # as None and can be removed later (e.g. with dropna()).
    if len(txt) > min_tokens:
        return ' '.join(txt)
    return None

In [150]:
# Read the MensRightsLaw comments CSV and tag every row with its subreddit name.
df = (
    pd.read_csv('./../data/reddit/cm/MensRightsLaw_comments.csv')
    .assign(subreddit='MensRightsLaw')
)

In [151]:
# Read the MRActivism comments CSV and tag every row with its subreddit name.
df2 = (
    pd.read_csv('./../data/reddit/cm/MRActivism_comments.csv')
    .assign(subreddit='MRActivism')
)

In [152]:
# Stack the two subreddit frames. pd.concat replaces DataFrame.append, which is
# deprecated and no longer available in pandas 2.x. As before, the original row
# labels of both frames are kept (no ignore_index).
df = pd.concat([df, df2])

In [153]:
# Add one placeholder row and renumber the index 0..n-1 (ignore_index=True).
# pd.concat with a one-row frame replaces the removed DataFrame.append(dict).
# NOTE(review): the purpose of this all-'1' row is not evident from the notebook —
# confirm it is meant to stay in the data.
df = pd.concat(
    [
        df,
        pd.DataFrame([{'ID':'1', 'Comment':'1', 'Author':'1', 'Score':1, 'Parent id':'1', 'Publish Date':'1', 'subreddit':'1'}]),
    ],
    ignore_index=True,
)

In [154]:
# Show (rows, columns) of the combined frame as a quick sanity check.
df.shape

(707, 7)

In [155]:
# Lazily normalise each comment: every run of characters other than letters and
# apostrophes becomes a single space, then the text is lowercased. The result is
# a one-shot iterator, so it can be consumed only once.
brief_cleaning = map(
    lambda comment: re.sub("[^A-Za-z']+", ' ', str(comment)).lower(),
    df['Comment'],
)

In [156]:
 # Run the pre-cleaned comments through the spaCy pipeline in batches and
 # lemmatize each resulting Doc. The former n_threads argument is dropped: it is
 # deprecated in spaCy 2.1+ and not accepted by spaCy 3.
 # brief_cleaning is a one-shot iterator, so it must be rebuilt before this
 # cell is run again; otherwise the result is an empty list.
 txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=32)]

In [158]:
 # Wrap the cleaned texts in a single-column frame; an entry is None where
 # cleaning() judged the document too short.
 df_clean = pd.DataFrame({'clean': txt})

In [160]:
# Peek at the last rows of the cleaned frame.
df_clean.tail()

Unnamed: 0,clean
702,te talk thumb nail like hand exactly like
703,live sacramento life super farfetched lol good...
704,number sentence think read
705,exact thing toronto year
706,


In [125]:
# Store each row's position (0 .. n-1) in an explicit 'index' column.
df_clean['index'] = np.arange(len(df_clean))

In [94]:
# Add one extra row and renumber the index (ignore_index=True). pd.concat with a
# one-row frame replaces DataFrame.append(dict), which is no longer available in
# pandas 2.x.
# NOTE(review): this looks like a scratch/test row ('b', 709) — confirm it is wanted.
df_clean = pd.concat(
    [df_clean, pd.DataFrame([{'clean':'b', 'index':709}])],
    ignore_index=True,
)

In [100]:
# Keep only the 'clean' column, then drop missing and duplicate texts.
# NOTE(review): this rebinds df_clean from a DataFrame to a Series, discarding any
# other columns (e.g. 'index') — confirm that later cells expect a Series.
df_clean = df_clean['clean'].dropna().drop_duplicates()

In [101]:
# Peek at the last entries; df_clean is a Series named 'clean' at this point.
df_clean.tail()

602    live sacramento life super farfetched lol good...
603                           number sentence think read
604                             exact thing toronto year
605                                                    a
607                                                    b
Name: clean, dtype: object

### Completing the data frame (adding subreddit and date)

In [14]:
import pickle
import pandas as pd
import numpy as np

In [39]:
# Load the previously saved cleaned frame. Despite its .csv name the file holds
# pickle data, so it is read with pandas' pickle reader (the counterpart of
# DataFrame.to_pickle) rather than read_csv.
# SECURITY: unpickling executes arbitrary code — only load files you produced yourself.
df_clean = pd.read_pickle('df_clean.csv')

In [6]:
# Build one combined comments frame from the per-subreddit CSV files.
sdf = pd.read_csv('./../subreddits.csv')
subreddits = sdf.values.tolist()

# Collect the per-subreddit frames and concatenate once at the end. Calling
# DataFrame.append inside the loop re-copied the accumulated frame on every
# iteration (quadratic cost), and the method is no longer available in pandas 2.x.
frames = []
for s in subreddits:
    # Drop the first and last five characters of the row's string form to get
    # the bare subreddit name.
    # NOTE(review): this depends on the exact layout of subreddits.csv — confirm.
    sub = str(s)[5:-5]

    temp_df = pd.read_csv(f'./../data/reddit/cm/{sub}_comments.csv')
    temp_df['subreddit'] = sub
    frames.append(temp_df)
    print(f'added {sub}')

# ignore_index=True gives the combined frame a fresh 0..n-1 index.
df = pd.concat(frames, ignore_index=True)

added MGTOW
added exredpill
added RedPillParenting
added redpillbooks
added TheRedPill
added RedPillWomen
added asktrp
added thankTRP
added becomeaman
added GEOTRP
added TRPOffTopic
added Braincels
added askanincel
added BlackPillScience
added IncelsWithoutHate
added ForeverAlone
added MensRightsLaw
added MRActivism
added FeMRA
added LadyMRAs
added Masculism
added MensRants
added MRRef
added FeMRADebates
added againstmensrights
added TheBluePill


In [12]:
# Preview the first rows of the combined comments frame.
df.head()

Unnamed: 0,ID,Comment,Author,Score,Parent id,Publish Date,subreddit
0,c5eb3m3,[deleted],[deleted],1,t3_wlcy9,2012-07-15 11:07:27,MGTOW
1,c5eey4f,[deleted],[deleted],3,t3_wlscd,2012-07-15 16:41:13,MGTOW
2,c5ef264,Statism and socialism are intrinsically anti-m...,gege33,3,t3_wlomv,2012-07-15 16:49:37,MGTOW
3,c5efadr,[deleted],[deleted],5,t3_wlp3l,2012-07-15 17:07:08,MGTOW
4,c5effq0,[deleted],[deleted],3,t3_wltx8,2012-07-15 17:18:40,MGTOW


In [40]:
# Drop rows with missing values, then rows that are exact duplicates.
df_clean = df_clean.dropna().drop_duplicates()

In [41]:
# Attach each comment's subreddit. pandas aligns the assigned Series on index labels,
# so this presumably relies on df_clean's index matching df's row labels — TODO confirm.
df_clean['subreddit'] = df['subreddit']

In [42]:
# Attach each comment's publish date under the shorter column name 'Date'.
# Assignment is aligned on index labels, so df_clean's index presumably has to
# match df's row labels — TODO confirm.
df_clean['Date'] = df['Publish Date']

In [44]:
# Inspect the last rows to check the newly added columns.
df_clean.tail()

Unnamed: 0,clean,subreddit,Date
13761265,interesting point actually consider,TheBluePill,2019-05-06 23:28:10
13761266,guy actually pay money convention tell chad ...,TheBluePill,2019-05-06 23:51:00
13761267,swindle pretend call grift con,TheBluePill,2019-05-07 00:13:36
13761268,nah properly convention bring pua unironically...,TheBluePill,2019-05-07 00:17:20
13761269,grift griftman,TheBluePill,2019-05-07 00:44:27


In [45]:
# Persist the enriched frame. to_pickle writes pickle data, so the file is a pickle
# despite its .csv name; the path is left unchanged so anything that already reads
# this file keeps working. (The needless f-string prefix was removed.)
df_clean.to_pickle("df_clean_sub_date.csv")