# Cleaning

Import relevant packages:
- Here we will include the big three: pandas, numpy, and matplotlib
- BeautifulSoup for cleaning html artifacts from our data

In [1]:
import pandas as pd
import numpy as np
import regex as re
import matplotlib.pyplot as plt


from bs4 import BeautifulSoup

from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

from sklearn.svm import SVC

from urllib.parse import urlparse

### Read in the Data

In [2]:
# Read in Machine learning and datascience data
mc = pd.read_csv('./data/machinelearning_1.csv')
ds = pd.read_csv('./data/datascience_1.csv')

In [3]:
# Concatenate the data
df = pd.concat([mc,ds])
df.reset_index(drop=True, inplace = True)

In [4]:
df.head()

Unnamed: 0,title,selftext,created_utc,num_comments,num_crossposts,score,subreddit
0,[D] Hinton responds to Schmidhuber,,1587609168,0,0,1,MachineLearning
1,Hinton responds to Schmidhuber,,1587609111,1,0,1,MachineLearning
2,"[D] Other than vectorization, what other aspec...",I'm helping a friend design a course with dual...,1587606108,2,0,1,MachineLearning
3,Survey for IT Employees working from home! Hel...,,1587604741,2,0,1,MachineLearning
4,[R] Chip Placement with Deep Reinforcement Lea...,,1587604558,1,0,1,MachineLearning


### Cleaning [deleted] and [removed] rows from title and selftext

There are still some remaining rows in title and selftext that have some deleted and removed rows.

Lets create a mask that looks for them and then drops them.

In [5]:
# Create a boolean mask for titles that are removed and deleted
title_removed = (df['title']=='[removed]')
title_deleted = (df['title']=='[deleted]')

# Create a boolean mask for subtexts that are removed and deleted
selftext_removed = (df['selftext']=='[removed]')
selftext_deleted = (df['selftext']=='[deleted]')

# Check if there are any removed or deleted values. 
#  Remove them

if (len(df[title_removed]) + len(df[title_deleted]) + 
     len(df[selftext_removed]) + len(df[selftext_deleted])) > 0:
    
    # Remove rows with '[deleted]' as the title or selftext
    df.drop(labels = df[selftext_deleted].index, axis = 0, inplace=True)
    df.reset_index(drop=True, inplace = True)

In [6]:
a = 'string'
a = ''
not a

True

In [7]:
df['selftext']

0                                                      NaN
1                                                      NaN
2        I'm helping a friend design a course with dual...
3                                                      NaN
4                                                      NaN
                               ...                        
39808                                                  NaN
39809                                                  NaN
39810                                                  NaN
39811                                                  NaN
39812    Hi tech geeks need your advice as solid primer...
Name: selftext, Length: 39813, dtype: object

In [8]:
#str(df['selftext'][0]) == 'nan'
pd.isnull(df['selftext'][0])

True

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39813 entries, 0 to 39812
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   title           39813 non-null  object
 1   selftext        23811 non-null  object
 2   created_utc     39813 non-null  int64 
 3   num_comments    39813 non-null  int64 
 4   num_crossposts  39813 non-null  int64 
 5   score           39813 non-null  int64 
 6   subreddit       39813 non-null  object
dtypes: int64(4), object(3)
memory usage: 2.1+ MB


### Clean out each review

In [10]:
# function that runs over reviews 

def review_to_words(raw_review):
    if pd.isnull(raw_review):
        return("")
    else:
        # Remove 
        review_text = BeautifulSoup(raw_review).get_text()
        letters_only = re.sub("[^a-zA-Z]", " ", review_text)
        #text = re.sub(r'^https?:\/\/.*[\r\n]*', '', letters_only, flags=re.MULTILINE)
        text = re.sub(r'(\(https:\/\/[^\s]+)|https:\/\/[^\s]+', '', letters_only, flags=re.MULTILINE)
        words = text.lower().split()
        stops = set(stopwords.words('english'))
        meaningful_words = [w for w in words if w not in stops]
    return(" ".join(meaningful_words))

In [11]:
%%time

# Running through review_to_words to clean each row
df.insert(2,column = 'clean_title',value = [review_to_words(element) for element in df['title']])
df.insert(3,column = 'clean_selftext',value = [review_to_words(element) for element in df['selftext']])

  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Sou

CPU times: user 54.2 s, sys: 10.8 s, total: 1min 5s
Wall time: 1min 25s


In [12]:
#issue = 'https://www.tiki.systems/'
#fake = 'asldfkjas lskdjf df'
#raw_review = 'here is a sample URL https://lzone.de/examples/Python%20re.sub or how about this https://docs.python.org/3/library/re.html'

def review_to_url(raw_review):
    if pd.isnull(raw_review):
        return("")
    else:
        domain_string = ' '
        domains = []
        urls = re.findall('https:\/\/[^\s]+', raw_review)
        for url in urls:
            try:
                hostname = urlparse(url).hostname
            except:
                hostname = ''
            domains.append(hostname)
    return domain_string.join(domains)

#print(review_to_url(issue))
#print(review_to_url(fake))
#print(review_to_url(raw_review))

In [13]:
list_of_urls = [review_to_url(element) for element in df['selftext']]

In [14]:
%%time
# Running through review_to_urls to extract hostnames from title and selftext
df.insert(3,column = 'title_urls',value = [review_to_url(element) for element in df['title']])
df.insert(3,column = 'selftext_urls',value = [review_to_url(element) for element in df['selftext']])

CPU times: user 826 ms, sys: 63.8 ms, total: 890 ms
Wall time: 978 ms


### Now we work on our data

### Create a subreddit column

In [15]:
# Create a target variable of 0/1 in dataframe
df['Subreddit_name'] = [1 if element == 'datascience' else 0 for element in df['subreddit']]

In [17]:
# Create a merged text field of clean_titles, clean_subtext and clean_urls
df['merged'] = df['clean_title'] + df['clean_selftext'] + df['selftext_urls'] + df['title_urls']

In [18]:
df.head()

Unnamed: 0,title,selftext,clean_title,selftext_urls,title_urls,clean_selftext,created_utc,num_comments,num_crossposts,score,subreddit,Subreddit_name,merged
0,[D] Hinton responds to Schmidhuber,,hinton responds schmidhuber,,,,1587609168,0,0,1,MachineLearning,0,hinton responds schmidhuber
1,Hinton responds to Schmidhuber,,hinton responds schmidhuber,,,,1587609111,1,0,1,MachineLearning,0,hinton responds schmidhuber
2,"[D] Other than vectorization, what other aspec...",I'm helping a friend design a course with dual...,vectorization aspects code optimization teach ...,,,helping friend design course dual intent entry...,1587606108,2,0,1,MachineLearning,0,vectorization aspects code optimization teach ...
3,Survey for IT Employees working from home! Hel...,,survey employees working home help us finding ...,,,,1587604741,2,0,1,MachineLearning,0,survey employees working home help us finding ...
4,[R] Chip Placement with Deep Reinforcement Lea...,,r chip placement deep reinforcement learning,,,,1587604558,1,0,1,MachineLearning,0,r chip placement deep reinforcement learning


In [19]:
df.to_csv('./data/clean.csv', index = False)