# Normalización

In [2]:
import pandas as pd
import numpy as np

In [3]:
def sentiment_processing(x):
    """Translate a textual sentiment label into a numeric flag.

    Returns 0 for "NEGATIVE", 1 for "POSITIVE" and NaN for any other value.
    The comparison is exact (case-sensitive).
    """
    if x == "NEGATIVE":
        return 0
    if x == "POSITIVE":
        return 1
    # Anything that is not one of the two known labels is treated as missing.
    return np.nan

### Cargamos el dataset

In [4]:
# Columns requested from the raw export via `usecols`.
wanted_columns = ["created_date", "subreddit", "title", "author", "full_link", "score", "post", "sentiment"]

df = pd.read_csv(
    "Datasets/reddit_database_sentiment.csv",
    sep=";",
    usecols=wanted_columns,
    index_col="created_date",
    parse_dates=["created_date"],
    # Map the textual sentiment labels to 0/1 (NaN otherwise) while parsing.
    converters={"sentiment": sentiment_processing},
    low_memory=False,
)

In [5]:
# Total footprint of the frame, including the contents of object columns (deep=True).
raw_total_bytes = df.memory_usage(deep=True).sum()
print(f"El archivo ocupa: {raw_total_bytes} Bytes")

El archivo ocupa: 386445993 Bytes


In [6]:
df

Unnamed: 0_level_0,subreddit,title,author,full_link,score,post,sentiment
created_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-02-11 19:47:22,analytics,So what do you guys all do related to analytic...,xtom,https://www.reddit.com/r/analytics/comments/b0...,7.0,There's a lot of reasons to want to know all t...,0.0
2010-03-04 20:17:26,analytics,"Google's Invasive, non-Anonymized Ad Targeting...",xtom,https://www.reddit.com/r/analytics/comments/b9...,2.0,"I'm cross posting this from /r/cyberlaw, hopef...",0.0
2011-01-06 04:51:18,analytics,"DotCed - Functional Web Analytics - Tagging, R...",dotced,https://www.reddit.com/r/analytics/comments/ew...,1.0,"DotCed,a Functional Analytics Consultant, offe...",0.0
2011-01-19 11:45:30,analytics,Program Details - Data Analytics Course,iqrconsulting,https://www.reddit.com/r/analytics/comments/f5...,0.0,Here is the program details of the data analyt...,0.0
2011-01-19 21:52:28,analytics,potential job in web analytics... need to anal...,therewontberiots,https://www.reddit.com/r/analytics/comments/f5...,2.0,i decided grad school (physics) was not for me...,1.0
...,...,...,...,...,...,...,...
2022-05-07 21:38:52,rstats,Help interpretting lmer model output,seeking-stillness,https://www.reddit.com/r/rstats/comments/ukjiy...,1.0,Hello! I am wonder how the following output wo...,0.0
2022-05-07 22:13:52,rstats,Medical stats book with R,Sweaty_Catch_4275,https://www.reddit.com/r/rstats/comments/ukk7u...,1.0,Can anybody recommend me a book with medical s...,1.0
2022-05-08 00:38:50,rstats,Markov chains with unequal sequence lengths,sebelly,https://www.reddit.com/r/rstats/comments/ukn1i...,1.0,I'm trying to build a simple Markov chain. I h...,0.0
2022-05-08 01:19:00,rstats,view all available Rcpp::plugins,BOBOLIU,https://www.reddit.com/r/rstats/comments/uknuh...,1.0,How do I view all available Rcpp::plugins? Tha...,1.0


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 274239 entries, 2010-02-11 19:47:22 to 2022-05-08 01:19:34
Data columns (total 7 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   subreddit  274209 non-null  object 
 1   title      274209 non-null  object 
 2   author     274209 non-null  object 
 3   full_link  274209 non-null  object 
 4   score      274209 non-null  float64
 5   post       274209 non-null  object 
 6   sentiment  274203 non-null  float64
dtypes: float64(2), object(5)
memory usage: 16.7+ MB


### Tratamiento de valores nulos y no correspondidos

#### Hay valores string y nulos en el índice

In [8]:
# Force the index to datetimes; entries that cannot be parsed become NaT.
df.index = pd.to_datetime(df.index, errors="coerce")

# Remove every row whose index label ended up missing after the coercion.
unparsed_labels = df.index[df.index.isna()]
df.drop(unparsed_labels, inplace=True)

In [9]:
# Order the rows by their index (the creation timestamp), modifying df in place.
df.sort_index(inplace=True)

In [10]:
df

Unnamed: 0_level_0,subreddit,title,author,full_link,score,post,sentiment
created_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2009-06-23 16:50:44,artificial,Are worms intelligent?,ithkuil,https://www.reddit.com/r/artificial/comments/8...,3.0,This is me trying to start thinking about my o...,0.0
2009-07-01 23:59:58,statistics,Is a masters in Statistics worth it?,nazghash,https://www.reddit.com/r/statistics/comments/8...,8.0,I'm working on an MS in Statistics at a state ...,0.0
2009-07-05 07:39:04,statistics,Books/resources that discuss change point anal...,Abhishek_Ghose,https://www.reddit.com/r/statistics/comments/8...,2.0,I am looking for books/online-resources that d...,1.0
2009-07-07 03:58:15,statistics,Pari-Mutuel Horse Racing Pool?,painperdu,https://www.reddit.com/r/statistics/comments/8...,0.0,Anyone study angles on how to beat a pari-mutu...,0.0
2009-07-21 16:55:04,statistics,Ask Stats: I got a BS in stats and I'm startin...,mathsuu,https://www.reddit.com/r/statistics/comments/9...,0.0,I should mention that I know SAS and R pretty ...,0.0
...,...,...,...,...,...,...,...
2022-05-08 18:00:10,MachineLearning,[D] Simple Questions Thread,AutoModerator,https://www.reddit.com/r/MachineLearning/comme...,1.0,Please post your questions here instead of cre...,0.0
2022-05-08 18:13:53,computerscience,Question about binary code and files identity,pc0999,https://www.reddit.com/r/computerscience/comme...,1.0,"Hi, \n\nI am a philosophy student writing an e...",0.0
2022-05-08 18:15:56,dataengineering,Creating a data pipeline in AWS,OinkOink9,https://www.reddit.com/r/dataengineering/comme...,1.0,I want to create a data pipeline in AWS (free-...,0.0
2022-05-08 19:01:40,dataengineering,I have a free pass to MLOps World in Toronto C...,Quiet_Basket_9699,https://www.reddit.com/r/dataengineering/comme...,1.0,You can see workshops etc www.MLOps world.com,1.0


In [11]:
df.isna().sum()

subreddit    0
title        0
author       0
full_link    0
score        0
post         0
sentiment    6
dtype: int64

In [12]:
# Remove the rows without a sentiment label.
# dropna(subset=...) filters row by row. Dropping by index *label* instead would
# also delete any valid row that happens to share its timestamp with an
# unlabeled one, because the datetime index is not guaranteed to be unique.
df.dropna(subset=["sentiment"], inplace=True)

### Tratamiento de tipos

In [12]:
df.dtypes

subreddit     object
title         object
author        object
full_link     object
score        float64
post          object
sentiment    float64
dtype: object

In [13]:
df

Unnamed: 0_level_0,subreddit,title,author,full_link,score,post,sentiment
created_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2009-06-23 16:50:44,artificial,Are worms intelligent?,ithkuil,https://www.reddit.com/r/artificial/comments/8...,3.0,This is me trying to start thinking about my o...,0.0
2009-07-01 23:59:58,statistics,Is a masters in Statistics worth it?,nazghash,https://www.reddit.com/r/statistics/comments/8...,8.0,I'm working on an MS in Statistics at a state ...,0.0
2009-07-05 07:39:04,statistics,Books/resources that discuss change point anal...,Abhishek_Ghose,https://www.reddit.com/r/statistics/comments/8...,2.0,I am looking for books/online-resources that d...,1.0
2009-07-07 03:58:15,statistics,Pari-Mutuel Horse Racing Pool?,painperdu,https://www.reddit.com/r/statistics/comments/8...,0.0,Anyone study angles on how to beat a pari-mutu...,0.0
2009-07-21 16:55:04,statistics,Ask Stats: I got a BS in stats and I'm startin...,mathsuu,https://www.reddit.com/r/statistics/comments/9...,0.0,I should mention that I know SAS and R pretty ...,0.0
...,...,...,...,...,...,...,...
2022-05-08 18:00:10,MachineLearning,[D] Simple Questions Thread,AutoModerator,https://www.reddit.com/r/MachineLearning/comme...,1.0,Please post your questions here instead of cre...,0.0
2022-05-08 18:13:53,computerscience,Question about binary code and files identity,pc0999,https://www.reddit.com/r/computerscience/comme...,1.0,"Hi, \n\nI am a philosophy student writing an e...",0.0
2022-05-08 18:15:56,dataengineering,Creating a data pipeline in AWS,OinkOink9,https://www.reddit.com/r/dataengineering/comme...,1.0,I want to create a data pipeline in AWS (free-...,0.0
2022-05-08 19:01:40,dataengineering,I have a free pass to MLOps World in Toronto C...,Quiet_Basket_9699,https://www.reddit.com/r/dataengineering/comme...,1.0,You can see workshops etc www.MLOps world.com,1.0


#### Valores numéricos

In [14]:
# Labels of every column that currently has a numeric dtype.
number_cols = df.select_dtypes(include="number").columns

In [15]:
# Downcast the numeric columns one at a time with pd.to_numeric(downcast="unsigned").
for numeric_col in number_cols:
    df[numeric_col] = pd.to_numeric(df[numeric_col], downcast="unsigned")


In [16]:
# Let pandas infer the best available dtypes for the object columns.
object_cols = df.select_dtypes(include="object").columns
converted_objects = df[object_cols].convert_dtypes()
df[object_cols] = converted_objects

In [17]:
# Store the subreddit name as a categorical column.
df["subreddit"] = df["subreddit"].astype("category")

In [18]:
df.dtypes

subreddit          category
title        string[python]
author       string[python]
full_link    string[python]
score                uint16
post         string[python]
sentiment             uint8
dtype: object

In [19]:
# Total footprint of the frame after the dtype changes (deep=True counts string contents).
typed_total_bytes = df.memory_usage(deep=True).sum()
print(f"El archivo ocupa: {typed_total_bytes} Bytes")

El archivo ocupa: 349751009 Bytes


In [20]:
import nltk
from nltk.corpus import stopwords
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import emoji
import contractions

In [None]:
# Patterns compiled once instead of being looked up on every call.
_URL_PATTERN = re.compile(r'((http|https)\:\/\/)?([a-zA-Z0-9\.\-]+)\.([a-zA-Z]{2,})(\/[a-zA-Z0-9\#\?\&\=\.\_\-]*)*')
_NON_LETTER_PATTERN = re.compile(r'[^a-z\s]')

# Filled on first use so the NLTK resources are only loaded when actually needed.
_NLP_RESOURCES = {}


def _get_nlp_resources():
    """Return the cached ``(stop_words, lemmatizer)`` pair, creating it on first use."""
    if not _NLP_RESOURCES:
        # A frozenset gives constant-time membership tests; the raw NLTK value is a list.
        _NLP_RESOURCES["stop_words"] = frozenset(stopwords.words('english'))
        _NLP_RESOURCES["lemmatizer"] = WordNetLemmatizer()
    return _NLP_RESOURCES["stop_words"], _NLP_RESOURCES["lemmatizer"]


def preprocess_post(text: str) -> str:
    """Normalize a post into a space-separated string of lemmatized tokens.

    Steps, in order: lowercase, replace emojis with their text form, expand
    contractions, strip URLs, strip every character that is not a-z or
    whitespace, tokenize, drop English stop words and one-character tokens,
    and lemmatize what remains.

    Args:
        text: Raw post text. Non-string values are converted with ``str()``.

    Returns:
        The cleaned tokens joined by single spaces ("" when nothing is left).
    """
    stop_words, lemmatizer = _get_nlp_resources()

    # Lowercase
    text = str(text).lower()

    # Convert emojis to text
    text = emoji.demojize(text)

    # Expand contractions
    text = contractions.fix(text)

    # Remove URLs
    # NOTE(review): the scheme group is optional, so any dotted token such as
    # "model.fit" also matches and is removed - confirm this is intended.
    text = _URL_PATTERN.sub('', text)

    # Remove special characters and digits (they are deleted, not replaced by spaces)
    text = _NON_LETTER_PATTERN.sub('', text)

    # Tokenize, then drop stop words and one-character tokens
    words = [
        word
        for word in word_tokenize(text)
        if word not in stop_words and len(word) > 1
    ]

    # Lemmatize
    return " ".join(lemmatizer.lemmatize(word) for word in words)



In [22]:
from tqdm import tqdm

In [None]:
# Clean every post. tqdm wraps the *input* so the bar advances as each post is
# processed; wrapping the finished result would only time iteration over
# already-computed values. preprocess_post is deliberately not rebound, so this
# cell can be re-run without stacking wrappers around the function.
df["clean_post"] = [preprocess_post(post) for post in tqdm(df["post"])]

100%|██████████| 274203/274203 [00:58<00:00, 4715.21it/s]


In [24]:
# Write the processed frame to a ';'-separated CSV (the index is written too, by default).
df.to_csv("processed_df.csv", sep=";")