# Import libraries

In [1]:
import pandas as pd
import numpy as np
from textblob import TextBlob
from spellchecker import SpellChecker


# Import dataset

In [2]:
commentsToScore = pd.read_csv("comments_to_score.csv")


In [3]:
commentsToScore

Unnamed: 0,comment_id,text
0,114890,"""\n \n\nGjalexei, you asked about whether ther..."
1,732895,"Looks like be have an abuser , can you please ..."
2,1139051,I confess to having complete (and apparently b...
3,1434512,"""\n\nFreud's ideas are certainly much discusse..."
4,2084821,It is not just you. This is a laundry list of ...
...,...,...
7532,504235362,"Go away, you annoying vandal."
7533,504235566,This user is a vandal.
7534,504308177,""" \n\nSorry to sound like a pain, but one by f..."
7535,504570375,Well it's pretty fucking irrelevant now I'm un...


In [4]:
commentsToScore.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7537 entries, 0 to 7536
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   comment_id  7537 non-null   int64 
 1   text        7537 non-null   object
dtypes: int64(1), object(1)
memory usage: 117.9+ KB


In [5]:
commentsToScore.describe()

Unnamed: 0,comment_id
count,7537.0
mean,223519200.0
std,138881700.0
min,114890.0
25%,98576620.0
50%,209314700.0
75%,332836600.0
max,504598200.0


# Text Cleaning

In [6]:
def text_preprocess(row):
    row = ' '.join([r for r in row.split() if r != ' ']) # removing multiple whitespaces in a row
    row = row.replace("\n", "") # removing newline character
    row = row.replace("\"","") # remove double quotes 

    # check spelling mistakes
    
    return row

commentsToScore['Cleaned Text'] = commentsToScore['text'].apply(text_preprocess)


In [7]:
commentsToScore['Cleaned Text']

0        Gjalexei, you asked about whether there is an...
1       Looks like be have an abuser , can you please ...
2       I confess to having complete (and apparently b...
3        Freud's ideas are certainly much discussed to...
4       It is not just you. This is a laundry list of ...
                              ...                        
7532                        Go away, you annoying vandal.
7533                               This user is a vandal.
7534     Sorry to sound like a pain, but one by follow...
7535    Well it's pretty fucking irrelevant now I'm un...
7536    The team name is Great Britain and Northern Ir...
Name: Cleaned Text, Length: 7537, dtype: object

In [8]:
%%time
def spellchecktextblob(row):
    spell = SpellChecker()
    listRow = list(set(spell.unknown(row.split(" "))))

    for word in listRow:
        #print(str(TextBlob(word).correct()))
        row.replace(word, str(TextBlob(word).correct()))

    return row

length = len(commentsToScore)
commentsToScore['Corrected spelling'] = ''
for i in range(7):
    batch_spellcheck = pd.DataFrame(commentsToScore['Cleaned Text'][i*1000:(i+1)*1000])
    df = batch_spellcheck['Cleaned Text'].apply(spellchecktextblob)
    commentsToScore['Corrected spelling'][i*1000:(i+1)*1000] = df

batch_spellcheck = pd.DataFrame(commentsToScore['Cleaned Text'][7000:])
df = batch_spellcheck['Cleaned Text'].apply(spellchecktextblob)
commentsToScore['Corrected spelling'][7000:] = df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


CPU times: user 46min, sys: 8.84 s, total: 46min 9s
Wall time: 46min 51s


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [9]:
commentsToScore['Corrected spelling'].tail(5)


7532                        Go away, you annoying vandal.
7533                               This user is a vandal.
7534     Sorry to sound like a pain, but one by follow...
7535    Well it's pretty fucking irrelevant now I'm un...
7536    The team name is Great Britain and Northern Ir...
Name: Corrected spelling, dtype: object

In [10]:
%%time
def spellchecklib(row):
    spell = SpellChecker()
    listRow = list(set(spell.unknown(row.split(" "))))

    for word in listRow:
        row.replace(word, spell.correction(word))

    return row

length = len(commentsToScore)
commentsToScore['Corrected spelling(pyspellcheck)'] = ''
for i in range(7):
    batch_spellcheck = pd.DataFrame(commentsToScore['Cleaned Text'][i*1000:(i+1)*1000])
    df = batch_spellcheck['Cleaned Text'].apply(spellchecktextblob)
    commentsToScore['Corrected spelling(pyspellcheck)'][i*1000:(i+1)*1000] = df

batch_spellcheck = pd.DataFrame(commentsToScore['Cleaned Text'][7000:])
df = batch_spellcheck['Cleaned Text'].apply(spellchecktextblob)
commentsToScore['Corrected spelling(pyspellcheck)'][7000:] = df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


CPU times: user 45min 55s, sys: 3.68 s, total: 45min 59s
Wall time: 57min 27s


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [11]:
print(commentsToScore['Corrected spelling(pyspellcheck)'])

0        Gjalexei, you asked about whether there is an...
1       Looks like be have an abuser , can you please ...
2       I confess to having complete (and apparently b...
3        Freud's ideas are certainly much discussed to...
4       It is not just you. This is a laundry list of ...
                              ...                        
7532                        Go away, you annoying vandal.
7533                               This user is a vandal.
7534     Sorry to sound like a pain, but one by follow...
7535    Well it's pretty fucking irrelevant now I'm un...
7536    The team name is Great Britain and Northern Ir...
Name: Corrected spelling(pyspellcheck), Length: 7537, dtype: object


In [12]:
commentsToScore.to_csv('cleanedtext.csv')