In [7]:
from cleantext import clean
import pandas as pd
import spacy

In [122]:
# Load spaCy's small English pipeline once; DataCleaning.sentence_tokenize reads it
# through this module-level name.
nlp = spacy.load("en_core_web_sm")
# spaCy rejects texts longer than `max_length` characters; raise the limit so very
# long comment strings can still be parsed.
nlp.max_length = 6500000

In [123]:
class DataCleaning:
    """Clean the ``comment`` column of a CSV file row by row.

    Each comment is split into sentences with the module-level spaCy pipeline
    ``nlp``, sentences with too few words are dropped, and the remaining ones
    are normalised with ``cleantext.clean`` and joined back into one string.

    Attributes:
        path: Location of the CSV file passed to ``pandas.read_csv``.
        chunksize: Rows per chunk, or ``None`` to load the whole file at once.
        dataframe: Whatever ``pandas.read_csv`` returned: a chunk iterator when
            ``chunksize`` is set, otherwise a single ``DataFrame``.
        data: Rows (``pandas.Series``) cleaned successfully by ``fit``.
        error: Original rows whose cleaning raised an exception.
        error_reasons: ``repr`` of the exception for each entry of ``error``,
            in the same order.
    """

    def __init__(self, path, chunksize=None):
        """Open ``path`` for reading.

        Args:
            path: CSV file to read; it must provide a ``comment`` column for
                ``fit`` to clean.
            chunksize: Number of rows per chunk. With ``None`` the entire file
                is loaded eagerly.
        """
        self.path = path
        self.chunksize = chunksize
        self.dataframe = pd.read_csv(self.path, chunksize=self.chunksize)
        # Created up front so export_csv() does not raise AttributeError when it
        # is called before fit().
        self.data = []
        self.error = []
        self.error_reasons = []

    def clean_text(self, text):
        """Normalise ``text`` with ``cleantext.clean`` and return the result.

        URLs, e-mail addresses, phone numbers, numbers and currency symbols are
        replaced by placeholder tokens, digits by ``0``, punctuation is removed
        and the text is lower-cased and transliterated to ASCII.
        """
        return clean(
            text,
            fix_unicode=True,               # fix various unicode errors
            to_ascii=True,                  # transliterate to closest ASCII representation
            lower=True,                     # lowercase text
            no_line_breaks=False,           # fully strip line breaks as opposed to only normalizing them
            no_urls=True,                   # replace all URLs with a special token
            no_emails=True,                 # replace all email addresses with a special token
            no_phone_numbers=True,          # replace all phone numbers with a special token
            no_numbers=True,                # replace all numbers with a special token
            no_digits=True,                 # replace all digits with a special token
            no_currency_symbols=True,       # replace all currency symbols with a special token
            no_punct=True,                  # fully remove punctuation
            replace_with_url="<URL>",
            replace_with_email="<EMAIL>",
            replace_with_phone_number="<PHONE>",
            replace_with_number="<NUMBER>",
            replace_with_digit="0",
            replace_with_currency_symbol="<CUR>",
            lang="en"                       # set to 'de' for German special handling
        )

    def sentence_tokenize(self, comment, min_words=10):
        """Split ``comment`` into sentences, drop short ones and clean the rest.

        Args:
            comment: Raw text to process.
            min_words: Sentences with fewer whitespace-separated words than
                this are discarded. Defaults to 10.

        Returns:
            The cleaned sentences joined by single spaces; an empty string when
            no sentence is long enough.
        """
        doc = nlp(comment)
        kept = [
            self.clean_text(sentence.text)
            for sentence in doc.sents
            # The length filter runs on the raw sentence, before cleaning.
            if len(sentence.text.split()) >= min_words
        ]
        return ' '.join(kept)

    def fit(self):
        """Clean the ``comment`` field of every row.

        Successfully cleaned rows are collected in ``self.data``. A row whose
        cleaning raises is stored unchanged in ``self.error`` with the matching
        exception ``repr`` in ``self.error_reasons``, and processing continues.
        When ``chunksize`` is set, a progress line is printed each time the
        number of cleaned rows reaches a multiple of it.

        Errors raised by pandas while reading a chunk are not caught; the rows
        gathered up to that point remain available in ``self.data``. A chunk
        iterator is consumed by this call.
        """
        self.data = []
        self.error = []
        self.error_reasons = []

        # Without a chunksize read_csv returned one DataFrame; iterating a
        # DataFrame yields column labels, so treat it as a single chunk.
        if isinstance(self.dataframe, pd.DataFrame):
            chunks = [self.dataframe]
        else:
            chunks = self.dataframe

        count = 0
        for chunk in chunks:
            for _, row in chunk.iterrows():
                # Work on a copy so the Series handed out by iterrows() is
                # never modified.
                cleaned_row = row.copy()
                try:
                    cleaned_row['comment'] = self.sentence_tokenize(str(row['comment']))
                except Exception as exc:
                    # Best effort: remember the row and the reason, then move on.
                    self.error.append(cleaned_row)
                    self.error_reasons.append(repr(exc))
                    continue
                self.data.append(cleaned_row)
                # Progress reporting needs a chunk size to use as the interval.
                if self.chunksize and count % self.chunksize == 0:
                    print(count / self.chunksize, len(self.data))
                count += 1

    def export_csv(self, path, index=False):
        """Write the cleaned rows to ``path`` as CSV.

        Args:
            path: Destination file.
            index: Whether to write the row index as the first column.
        """
        df = pd.DataFrame(self.data)
        # `index` must be passed by keyword: the second positional parameter of
        # DataFrame.to_csv is the separator, not the index flag.
        df.to_csv(path, index=index)

In [124]:
# Input CSV, given relative to the notebook's working directory.
path = "../data/Reddit-19K personality data/mbti9k_comments.csv"
# Read ten rows per chunk; fit() uses the same number as its progress-print interval.
dc = DataCleaning(path=path, chunksize=10)

In [125]:
# Run the cleaning pass. Each progress line printed below is
# "<cleaned rows so far / chunksize> <len(dc.data)>".
# NOTE(review): the captured output ends with a pandas ParserError, so this run
# did not finish; dc.data holds only the rows cleaned before the failure.
dc.fit()

0.0 1
1.0 11
2.0 21
3.0 31
4.0 41
5.0 51
6.0 61
7.0 71
8.0 81
9.0 91
10.0 101
11.0 111
12.0 121
13.0 131
14.0 141
15.0 151
16.0 161
17.0 171
18.0 181
19.0 191
20.0 201
21.0 211
22.0 221
23.0 231
24.0 241
25.0 251
26.0 261
27.0 271
28.0 281
29.0 291
30.0 301
31.0 311
32.0 321
33.0 331
34.0 341
35.0 351
36.0 361
37.0 371
38.0 381
39.0 391
40.0 401
41.0 411
42.0 421
43.0 431
44.0 441
45.0 451
46.0 461
47.0 471
48.0 481
49.0 491
50.0 501
51.0 511
52.0 521
53.0 531
54.0 541
55.0 551
56.0 561
57.0 571
58.0 581
59.0 591
60.0 601
61.0 611
62.0 621
63.0 631
64.0 641
65.0 651
66.0 661
67.0 671
68.0 681
69.0 691
70.0 701
71.0 711
72.0 721
73.0 731
74.0 741
75.0 751
76.0 761
77.0 771
78.0 781
79.0 791
80.0 801
81.0 811
82.0 821
83.0 831
84.0 841
85.0 851
86.0 861
87.0 871
88.0 881
89.0 891
90.0 901
91.0 911
92.0 921
93.0 931
94.0 941
95.0 951
96.0 961
97.0 971
98.0 981
99.0 991
100.0 1001
101.0 1011
102.0 1021
103.0 1031
104.0 1041
105.0 1051
106.0 1061
107.0 1071
108.0 1081
109.0 1091
110.0 1101


765.0 7651
766.0 7661
767.0 7671
768.0 7681
769.0 7691
770.0 7701
771.0 7711
772.0 7721
773.0 7731
774.0 7741
775.0 7751
776.0 7761
777.0 7771
778.0 7781
779.0 7791
780.0 7801
781.0 7811
782.0 7821
783.0 7831
784.0 7841
785.0 7851
786.0 7861
787.0 7871
788.0 7881
789.0 7891
790.0 7901
791.0 7911
792.0 7921
793.0 7931
794.0 7941
795.0 7951
796.0 7961
797.0 7971
798.0 7981
799.0 7991
800.0 8001
801.0 8011
802.0 8021
803.0 8031
804.0 8041
805.0 8051
806.0 8061
807.0 8071
808.0 8081
809.0 8091
810.0 8101
811.0 8111
812.0 8121
813.0 8131
814.0 8141
815.0 8151
816.0 8161
817.0 8171
818.0 8181
819.0 8191
820.0 8201
821.0 8211
822.0 8221
823.0 8231


ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [129]:
# Show the first row for which cleaning raised an exception inside fit().
dc.error[0]

author                                                               Anen-o-me
comment                      But they all walked into WWI after a century o...
type                                                                      intj
subreddits_commented                                                       457
mbti_subreddits_commented                                                    4
wc                                                                      899724
comments_num                                                             18995
Name: 304, dtype: object

In [60]:
# Separate chunked reader (five rows per chunk) for inspecting raw rows by hand.
mbti_comments = pd.read_csv(filepath_or_buffer=path, chunksize=5)

In [65]:
# Peek at one raw row: pull the next chunk from the reader, print its first row,
# then stop. Every run of this cell consumes one more chunk from `mbti_comments`.
# NOTE(review): the row label in the captured output (1735) suggests the reader had
# already been advanced before this run; re-create it to start from the first row.
for chunk in mbti_comments:
    for index, row in chunk.iterrows():
        print(row)
        break  # only the first row of the chunk is needed
    break  # only one chunk is needed

author                                                      HopefulRomantic527
comment                      You're in for an even bigger ride in the upcom...
type                                                                      infj
subreddits_commented                                                        58
mbti_subreddits_commented                                                    3
wc                                                                       18269
comments_num                                                               626
Name: 1735, dtype: object
