# ETL Users Reviews

In [69]:
import json
import os
import pandas as pd
import gzip
import ast
import numpy as np
import re
from datetime import datetime

When trying to load the file into a DF I encountered the error:
<br>

**JSONDecodeError**: Expecting property name enclosed in double quotes: line 1 column 3 (char 2)
<br>
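After extensive searching and delving into Stack Overflow, I stumbled upon an answer that was a breakthrough: each line of the file is a Python dictionary literal (single-quoted keys) rather than strict JSON, so it has to be parsed with `ast.literal_eval` instead of a JSON parser:<br>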

[Convert JSON to pd.DataFrame](https://stackoverflow.com/questions/55338899/convert-json-to-pd-dataframe/65427497#65427497)

In [70]:
# Location of the gzip-compressed reviews dump.
# NOTE(review): absolute, machine-specific path; consider a path relative to the project.
REVIEWS_GZ_PATH = r'C:\Users\flore\OneDrive\Escritorio\Etapa Labs\MLOPs\01. PI MLOps - STEAM\user_reviews.json.gz'

rows = []

# Stream the archive line by line instead of calling readlines(), so the whole
# decompressed file is never held in memory next to the parsed rows.
with gzip.open(REVIEWS_GZ_PATH, 'rb') as f:
    for raw_line in f:
        # A blank line (e.g. a trailing newline at the end of the file) would make
        # ast.literal_eval raise, so skip it.
        if not raw_line.strip():
            continue
        # Each line is a Python literal (single-quoted dict), not strict JSON,
        # hence ast.literal_eval rather than json.loads.
        rows.append(ast.literal_eval(raw_line.decode('utf-8')))

# Convert the list of dictionaries into a DataFrame
df_reviews = pd.DataFrame(rows)

# Display the first few rows of the DataFrame
df_reviews.head()

Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'funny': '', 'posted': 'Posted November 5, 2..."
1,js41637,http://steamcommunity.com/id/js41637,"[{'funny': '', 'posted': 'Posted June 24, 2014..."
2,evcentric,http://steamcommunity.com/id/evcentric,"[{'funny': '', 'posted': 'Posted February 3.',..."
3,doctr,http://steamcommunity.com/id/doctr,"[{'funny': '', 'posted': 'Posted October 14, 2..."
4,maplemage,http://steamcommunity.com/id/maplemage,"[{'funny': '3 people found this review funny',..."


We now can display the data inside the file and we see that we have 3 columns and the last column contains nested lists of dictionaries.

In [71]:
df_reviews.columns

Index(['user_id', 'user_url', 'reviews'], dtype='object')

In [72]:
df_reviews.shape

(25799, 3)

In [73]:
df_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25799 entries, 0 to 25798
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   user_id   25799 non-null  object
 1   user_url  25799 non-null  object
 2   reviews   25799 non-null  object
dtypes: object(3)
memory usage: 604.8+ KB


In [74]:
# Flag every row whose (user_id, user_url) pair occurs more than once.
# keep=False marks all members of a duplicated group, not only the repeats.
is_repeated_user = df_reviews.duplicated(subset=['user_id', 'user_url'], keep=False)
duplicated_rows = df_reviews.loc[is_repeated_user]
duplicated_rows

Unnamed: 0,user_id,user_url,reviews
9,76561198156664158,http://steamcommunity.com/profiles/76561198156...,"[{'funny': '', 'posted': 'Posted June 16.', 'l..."
50,Rivtex,http://steamcommunity.com/id/Rivtex,"[{'funny': '', 'posted': 'Posted December 23, ..."
83,76561198094224872,http://steamcommunity.com/profiles/76561198094...,[]
119,DieMadchenschanderin,http://steamcommunity.com/id/DieMadchenschanderin,"[{'funny': '', 'posted': 'Posted August 29, 20..."
147,relesprit,http://steamcommunity.com/id/relesprit,"[{'funny': '', 'posted': 'Posted December 27, ..."
...,...,...,...
17819,76561198076474887,http://steamcommunity.com/profiles/76561198076...,"[{'funny': '', 'posted': 'Posted April 12.', '..."
17916,yolofaceguy,http://steamcommunity.com/id/yolofaceguy,"[{'funny': '', 'posted': 'Posted October 31, 2..."
18028,76561198075591109,http://steamcommunity.com/profiles/76561198075...,"[{'funny': '', 'posted': 'Posted December 26, ..."
18234,76561198092022514,http://steamcommunity.com/profiles/76561198092...,"[{'funny': '', 'posted': 'Posted July 3.', 'la..."


In [75]:
# Order by user_id so that the members of each duplicated pair end up next to each other
df_sorted = df_reviews.sort_values('user_id')

# Same duplicate test as before, now on the sorted frame
dup_key = ['user_id', 'user_url']
duplicated_rows = df_sorted.loc[lambda frame: frame.duplicated(subset=dup_key, keep=False)]
duplicated_rows

Unnamed: 0,user_id,user_url,reviews
12888,05041129,http://steamcommunity.com/id/05041129,"[{'funny': '', 'posted': 'Posted May 18, 2015...."
5250,05041129,http://steamcommunity.com/id/05041129,"[{'funny': '', 'posted': 'Posted May 18, 2015...."
3134,111222333444555666888,http://steamcommunity.com/id/11122233344455566...,"[{'funny': '', 'posted': 'Posted December 22, ..."
3133,111222333444555666888,http://steamcommunity.com/id/11122233344455566...,"[{'funny': '', 'posted': 'Posted December 22, ..."
4138,29123,http://steamcommunity.com/id/29123,"[{'funny': '', 'posted': 'Posted March 26.', '..."
...,...,...,...
2721,xXAussieRockXx,http://steamcommunity.com/id/xXAussieRockXx,"[{'funny': '', 'posted': 'Posted July 17, 2015..."
2680,yolofaceguy,http://steamcommunity.com/id/yolofaceguy,"[{'funny': '', 'posted': 'Posted October 31, 2..."
17916,yolofaceguy,http://steamcommunity.com/id/yolofaceguy,"[{'funny': '', 'posted': 'Posted October 31, 2..."
5855,zeroblade,http://steamcommunity.com/id/zeroblade,"[{'funny': '', 'posted': 'Posted November 30, ..."


### Comparing one against the other we can validate that the records are indeed duplicated. So we drop the duplicated value.

In [76]:
# One row per (user_id, user_url): the first occurrence wins, later repeats are discarded
user_key = ['user_id', 'user_url']
df_reviews = df_reviews.drop_duplicates(subset=user_key, keep='first')

# Show the de-duplicated frame
df_reviews

Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'funny': '', 'posted': 'Posted November 5, 2..."
1,js41637,http://steamcommunity.com/id/js41637,"[{'funny': '', 'posted': 'Posted June 24, 2014..."
2,evcentric,http://steamcommunity.com/id/evcentric,"[{'funny': '', 'posted': 'Posted February 3.',..."
3,doctr,http://steamcommunity.com/id/doctr,"[{'funny': '', 'posted': 'Posted October 14, 2..."
4,maplemage,http://steamcommunity.com/id/maplemage,"[{'funny': '3 people found this review funny',..."
...,...,...,...
25794,76561198306599751,http://steamcommunity.com/profiles/76561198306...,"[{'funny': '', 'posted': 'Posted May 31.', 'la..."
25795,Ghoustik,http://steamcommunity.com/id/Ghoustik,"[{'funny': '', 'posted': 'Posted June 17.', 'l..."
25796,76561198310819422,http://steamcommunity.com/profiles/76561198310...,"[{'funny': '1 person found this review funny',..."
25797,76561198312638244,http://steamcommunity.com/profiles/76561198312...,"[{'funny': '', 'posted': 'Posted July 21.', 'l..."


## Now we can proceed to investigate our reviews column.
This column has the information from all of the reviews posted by the user identified with the unique user_id. 

### We display an example row of the third column to understand which information is valuable and how to extract it. This is the input of our sentiment analysis, so it is crucial. 

In [77]:
df_reviews.iloc[1,2]

[{'funny': '',
  'posted': 'Posted June 24, 2014.',
  'last_edited': '',
  'item_id': '251610',
  'helpful': '15 of 20 people (75%) found this review helpful',
  'recommend': True,
  'review': 'I know what you think when you see this title "Barbie Dreamhouse Party" but do not be intimidated by it\'s title, this is easily one of my GOTYs. You don\'t get any of that cliche game mechanics that all the latest games have, this is simply good core gameplay. Yes, you can\'t 360 noscope your friends, but what you can do is show them up with your bad ♥♥♥ dance moves and put them to shame as you show them what true fashion and color combinations are.I know this game says for kids but, this is easily for any age range and any age will have a blast playing this.8/8'},
 {'funny': '',
  'posted': 'Posted September 8, 2013.',
  'last_edited': '',
  'item_id': '227300',
  'helpful': '0 of 1 people (0%) found this review helpful',
  'recommend': True,
  'review': "For a simple (it's actually not all th

In [78]:
df_reviews.iloc[100,2]

[{'funny': '',
  'posted': 'Posted October 13, 2014.',
  'last_edited': '',
  'item_id': '209870',
  'helpful': '3 of 8 people (38%) found this review helpful',
  'recommend': True,
  'review': 'Its a very fun game i recomend as its nearly like TITANFALL but its FREE!Play this game now'}]

### We need to extract the information inside each field of the third column into its own column. <br> 
The fields that become columns are: funny, posted, last_edited, item_id, helpful, recommend and review (stored as review_text).

In [79]:
# 'reviews' is the column containing the list of dictionaries; its 'review' text is
# what the sentiment analysis needs, so every review becomes a row of its own.

# Column order of the unnested frame. Passing it explicitly to pd.DataFrame keeps the
# schema stable even when no user has any review.
UNNESTED_COLUMNS = ['user_id', 'user_url', 'funny', 'posted', 'last_edited',
                    'item_id', 'helpful', 'recommend', 'review_text']


def unnest_reviews(users_df):
    """Flatten the per-user review lists into one record per review.

    Parameters
    ----------
    users_df : pandas.DataFrame
        Frame with 'user_id', 'user_url' and 'reviews' columns, where each
        'reviews' cell is a list of dictionaries.

    Returns
    -------
    list of dict
        One dictionary per review with the keys listed in UNNESTED_COLUMNS.
        Fields missing from a review default to ''. Users whose review list is
        empty contribute no record.
    """
    copied_fields = ('funny', 'posted', 'last_edited', 'item_id', 'helpful', 'recommend')
    records = []
    # zip over the three columns avoids the per-row Series that iterrows() builds
    for user_id, user_url, user_reviews in zip(
        users_df['user_id'], users_df['user_url'], users_df['reviews']
    ):
        for review in user_reviews:
            record = {'user_id': user_id, 'user_url': user_url}
            for field in copied_fields:
                record[field] = review.get(field, '')
            # 'review' is renamed to 'review_text'
            record['review_text'] = review.get('review', '')
            records.append(record)
    return records


unnested_reviews = unnest_reviews(df_reviews)

# Create a new DataFrame 'df_reviews_unnested' from the list of unnested reviews
df_reviews_unnested = pd.DataFrame(unnested_reviews, columns=UNNESTED_COLUMNS)

In [80]:
df_reviews_unnested

Unnamed: 0,user_id,user_url,funny,posted,last_edited,item_id,helpful,recommend,review_text
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...
1,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted July 15, 2011.",,22200,No ratings yet,True,It's unique and worth a playthrough.
2,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted April 21, 2011.",,43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...
3,js41637,http://steamcommunity.com/id/js41637,,"Posted June 24, 2014.",,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...
4,js41637,http://steamcommunity.com/id/js41637,,"Posted September 8, 2013.",,227300,0 of 1 people (0%) found this review helpful,True,For a simple (it's actually not all that simpl...
...,...,...,...,...,...,...,...,...,...
58425,76561198312638244,http://steamcommunity.com/profiles/76561198312...,,Posted July 10.,,70,No ratings yet,True,a must have classic from steam definitely wort...
58426,76561198312638244,http://steamcommunity.com/profiles/76561198312...,,Posted July 8.,,362890,No ratings yet,True,this game is a perfect remake of the original ...
58427,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,1 person found this review funny,Posted July 3.,,273110,1 of 2 people (50%) found this review helpful,True,had so much fun plaing this and collecting res...
58428,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,,Posted July 20.,,730,No ratings yet,True,:D


In [81]:
df_reviews_unnested.columns

Index(['user_id', 'user_url', 'funny', 'posted', 'last_edited', 'item_id',
       'helpful', 'recommend', 'review_text'],
      dtype='object')

In [82]:
# From here on df_reviews refers to the unnested frame. This is a plain rebinding:
# no copy is made, so both names point to the same object.
df_reviews = df_reviews_unnested

In [83]:
df_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58430 entries, 0 to 58429
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   user_id      58430 non-null  object
 1   user_url     58430 non-null  object
 2   funny        58430 non-null  object
 3   posted       58430 non-null  object
 4   last_edited  58430 non-null  object
 5   item_id      58430 non-null  object
 6   helpful      58430 non-null  object
 7   recommend    58430 non-null  bool  
 8   review_text  58430 non-null  object
dtypes: bool(1), object(8)
memory usage: 3.6+ MB


## We check for missing values and duplicates
Many columns contain empty strings ('') left over from the unnesting step; these should be converted into missing values (NaN) so they can be counted.

In [84]:
# Build a new frame in which every empty string is replaced by NaN.
# replace() without inplace returns a fresh object, so the frame currently
# bound to df_reviews is left untouched.
df_reviews_copy = df_reviews.replace('', np.nan)

# Continue working with the cleaned frame
df_reviews = df_reviews_copy

In [85]:
# Number of rows, used both as the denominator and as an informational column
total_rows = len(df_reviews)
# Missing values per column
none_count = df_reviews.isnull().sum()
# Share of missing values per column, in percent
none_percentage = (none_count / total_rows) * 100
# Assemble the summary directly in its final column order; the scalar row count
# is broadcast to every row of the summary
none_info = pd.DataFrame({
    'Total Registers': total_rows,
    'None Count': none_count,
    'None Percentage': none_percentage,
})
none_info

Unnamed: 0,Total Registers,None Count,None Percentage
user_id,58430,0,0.0
user_url,58430,0,0.0
funny,58430,50420,86.291289
posted,58430,0,0.0
last_edited,58430,52393,89.667979
item_id,58430,0,0.0
helpful,58430,0,0.0
recommend,58430,0,0.0
review_text,58430,30,0.051343


### Given this insight into the data of the reviews, we can proceed to eliminate those two columns that are mostly None values: last_edited and funny

In [86]:
df_reviews['last_edited'].value_counts()

last_edited
Last edited November 25, 2013.    99
Last edited October 17, 2015.     18
Last edited July 25, 2015.        17
Last edited June 22, 2015.        16
Last edited December 29, 2015.    16
                                  ..
Last edited August 13, 2014.       1
Last edited February 26, 2014.     1
Last edited November 30, 2014.     1
Last edited February 28, 2014.     1
Last edited August 15, 2014.       1
Name: count, Length: 1014, dtype: int64

In [87]:
df_reviews['funny'].value_counts()

funny
1 person found this review funny        5083
2 people found this review funny        1213
3 people found this review funny         488
4 people found this review funny         263
5 people found this review funny         162
                                        ... 
58 people found this review funny          1
405 people found this review funny         1
105 people found this review funny         1
1,130 people found this review funny       1
825 people found this review funny         1
Name: count, Length: 185, dtype: int64

In [88]:
# Drop the two mostly-empty columns plus user_url.
# Returning a new frame instead of using inplace=True leaves other references to the
# old frame untouched, and errors='ignore' stops the cell raising KeyError on re-run.
df_reviews = df_reviews.drop(columns=['last_edited', 'funny', 'user_url'], errors='ignore')

In [89]:
df_reviews

Unnamed: 0,user_id,user_url,posted,item_id,helpful,recommend,review_text
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"Posted November 5, 2011.",1250,No ratings yet,True,Simple yet with great replayability. In my opi...
1,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"Posted July 15, 2011.",22200,No ratings yet,True,It's unique and worth a playthrough.
2,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"Posted April 21, 2011.",43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...
3,js41637,http://steamcommunity.com/id/js41637,"Posted June 24, 2014.",251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...
4,js41637,http://steamcommunity.com/id/js41637,"Posted September 8, 2013.",227300,0 of 1 people (0%) found this review helpful,True,For a simple (it's actually not all that simpl...
...,...,...,...,...,...,...,...
58425,76561198312638244,http://steamcommunity.com/profiles/76561198312...,Posted July 10.,70,No ratings yet,True,a must have classic from steam definitely wort...
58426,76561198312638244,http://steamcommunity.com/profiles/76561198312...,Posted July 8.,362890,No ratings yet,True,this game is a perfect remake of the original ...
58427,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,Posted July 3.,273110,1 of 2 people (50%) found this review helpful,True,had so much fun plaing this and collecting res...
58428,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,Posted July 20.,730,No ratings yet,True,:D


### We can also drop those records where review_text is missing

In [90]:
# Keep only the rows that actually carry a review text
has_review_text = df_reviews['review_text'].notna()
df_reviews = df_reviews.loc[has_review_text]

# Show the frame once the rows without review text are gone
df_reviews

Unnamed: 0,user_id,user_url,posted,item_id,helpful,recommend,review_text
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"Posted November 5, 2011.",1250,No ratings yet,True,Simple yet with great replayability. In my opi...
1,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"Posted July 15, 2011.",22200,No ratings yet,True,It's unique and worth a playthrough.
2,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"Posted April 21, 2011.",43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...
3,js41637,http://steamcommunity.com/id/js41637,"Posted June 24, 2014.",251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...
4,js41637,http://steamcommunity.com/id/js41637,"Posted September 8, 2013.",227300,0 of 1 people (0%) found this review helpful,True,For a simple (it's actually not all that simpl...
...,...,...,...,...,...,...,...
58425,76561198312638244,http://steamcommunity.com/profiles/76561198312...,Posted July 10.,70,No ratings yet,True,a must have classic from steam definitely wort...
58426,76561198312638244,http://steamcommunity.com/profiles/76561198312...,Posted July 8.,362890,No ratings yet,True,this game is a perfect remake of the original ...
58427,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,Posted July 3.,273110,1 of 2 people (50%) found this review helpful,True,had so much fun plaing this and collecting res...
58428,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,Posted July 20.,730,No ratings yet,True,:D


In [91]:
# Missing values per column after the clean-up
none_count = df_reviews.isnull().sum()
# Same figure expressed as a percentage of all rows
none_percentage = none_count.div(len(df_reviews)).mul(100)
# Put count and percentage side by side, add the total row count,
# and arrange the columns for display
none_info = (
    pd.concat(
        [none_count.rename('None Count'), none_percentage.rename('None Percentage')],
        axis=1,
    )
    .assign(**{'Total Registers': len(df_reviews)})
    [['Total Registers', 'None Count', 'None Percentage']]
)
none_info

Unnamed: 0,Total Registers,None Count,None Percentage
user_id,58400,0,0.0
user_url,58400,0,0.0
posted,58400,0,0.0
item_id,58400,0,0.0
helpful,58400,0,0.0
recommend,58400,0,0.0
review_text,58400,0,0.0


### Posted 
This column has the date when the review of the game was posted by the user.<br>
The raw format is `Posted <Month> <day>, <year>.`<br>
We need to transform it into yyyy-mm-dd format.<br>
We parse the month, day and year out of the text, build a `date` column from them, and keep the year in its own `year` column.<br>
The column does not follow a single format, though:<br>
Example: <br>
-   'Posted November 5, 2011.'<br>
-   'Posted July 2.'<br>
Values like the last one are missing the year; the conversion function fills in the current year for them, so they can be identified and filtered out later.<br>

In [92]:
df_reviews

Unnamed: 0,user_id,user_url,posted,item_id,helpful,recommend,review_text
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"Posted November 5, 2011.",1250,No ratings yet,True,Simple yet with great replayability. In my opi...
1,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"Posted July 15, 2011.",22200,No ratings yet,True,It's unique and worth a playthrough.
2,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"Posted April 21, 2011.",43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...
3,js41637,http://steamcommunity.com/id/js41637,"Posted June 24, 2014.",251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...
4,js41637,http://steamcommunity.com/id/js41637,"Posted September 8, 2013.",227300,0 of 1 people (0%) found this review helpful,True,For a simple (it's actually not all that simpl...
...,...,...,...,...,...,...,...
58425,76561198312638244,http://steamcommunity.com/profiles/76561198312...,Posted July 10.,70,No ratings yet,True,a must have classic from steam definitely wort...
58426,76561198312638244,http://steamcommunity.com/profiles/76561198312...,Posted July 8.,362890,No ratings yet,True,this game is a perfect remake of the original ...
58427,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,Posted July 3.,273110,1 of 2 people (50%) found this review helpful,True,had so much fun plaing this and collecting res...
58428,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,Posted July 20.,730,No ratings yet,True,:D


In [93]:
# Month name -> zero-padded month number. Built once at definition time instead of
# on every call of convert_posted.
_MONTH_NUMBERS = {
    "January": "01",
    "February": "02",
    "March": "03",
    "April": "04",
    "May": "05",
    "June": "06",
    "July": "07",
    "August": "08",
    "September": "09",
    "October": "10",
    "November": "11",
    "December": "12",
}


# Define the conversion function
def convert_posted(posted_str, default_year=None):
    """Convert a 'Posted <Month> <day>[, <year>].' string into 'YYYY-MM-DD'.

    Parameters
    ----------
    posted_str : str
        Raw text such as 'Posted November 5, 2011.' or 'Posted July 2.'.
    default_year : int or str, optional
        Year to use when the text carries no year. When omitted, the current
        year is used (the original behaviour).

    Returns
    -------
    str or None
        The date as 'YYYY-MM-DD', or None when the input is not a string, does
        not match the expected pattern, or names an unknown month. The result
        is not validated as a calendar date (e.g. 'February 30' is formatted
        as-is).
    """
    # Missing values (None / NaN) would make re.search raise TypeError
    if not isinstance(posted_str, str):
        return None

    match = re.search(r"Posted (\w+) (\d{1,2})(?:,)?(?: (\d{4}))?", posted_str)
    if match is None:
        return None

    month_str, day_str, year_str = match.groups()

    # \w+ accepts any word, so an unrecognised month name must not raise KeyError
    month_number = _MONTH_NUMBERS.get(month_str)
    if month_number is None:
        return None

    if year_str is None:
        # No year in the text: fall back to the caller's choice or the current year
        fallback_year = pd.Timestamp.now().year if default_year is None else default_year
        year_str = str(fallback_year)

    return f"{year_str}-{month_number}-{day_str.zfill(2)}"

In [94]:
# Turn the 'posted' text into 'YYYY-MM-DD' strings and parse them as datetimes;
# anything that cannot be parsed becomes NaT (errors='coerce')
parsed_dates = pd.to_datetime(df_reviews['posted'].apply(convert_posted), errors='coerce')

# New frame with the parsed 'date' plus the year extracted from it
df_reviews_copy = df_reviews.assign(date=parsed_dates, year=parsed_dates.dt.year)

df_reviews_copy

Unnamed: 0,user_id,user_url,posted,item_id,helpful,recommend,review_text,date,year
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"Posted November 5, 2011.",1250,No ratings yet,True,Simple yet with great replayability. In my opi...,2011-11-05,2011
1,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"Posted July 15, 2011.",22200,No ratings yet,True,It's unique and worth a playthrough.,2011-07-15,2011
2,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"Posted April 21, 2011.",43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...,2011-04-21,2011
3,js41637,http://steamcommunity.com/id/js41637,"Posted June 24, 2014.",251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...,2014-06-24,2014
4,js41637,http://steamcommunity.com/id/js41637,"Posted September 8, 2013.",227300,0 of 1 people (0%) found this review helpful,True,For a simple (it's actually not all that simpl...,2013-09-08,2013
...,...,...,...,...,...,...,...,...,...
58425,76561198312638244,http://steamcommunity.com/profiles/76561198312...,Posted July 10.,70,No ratings yet,True,a must have classic from steam definitely wort...,2024-07-10,2024
58426,76561198312638244,http://steamcommunity.com/profiles/76561198312...,Posted July 8.,362890,No ratings yet,True,this game is a perfect remake of the original ...,2024-07-08,2024
58427,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,Posted July 3.,273110,1 of 2 people (50%) found this review helpful,True,had so much fun plaing this and collecting res...,2024-07-03,2024
58428,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,Posted July 20.,730,No ratings yet,True,:D,2024-07-20,2024


In [95]:
# Adopt the frame that carries the parsed 'date' and 'year' columns as df_reviews.
# This is a plain rebinding: both names refer to the same object afterwards.
df_reviews = df_reviews_copy

### Now we have a date column, but for the records whose `posted` value came in the shorter format with no year, the current year was imputed. These 9929 records can be identified and filtered through the year column, and they should not be considered for the best_developer_year function 

In [96]:
df_reviews['year'].value_counts()

year
2014    21821
2015    18146
2024     9929
2013     6707
2012     1201
2011      530
2010       66
Name: count, dtype: int64

In [98]:
# Count NaN values in the 'year' column
nan_count = df_reviews['year'].isna().sum()
nan_count

0

In [99]:
# The raw 'posted' text is dropped now that 'date' and 'year' exist.
# A non in-place drop with errors='ignore' means re-running the cell does not raise
# KeyError and other references to the previous frame are not mutated.
df_reviews = df_reviews.drop(columns=['posted'], errors='ignore')

In [100]:
# Count NaN values in the 'user_id' column
nan_count = df_reviews['user_id'].isna().sum()
nan_count

0

In [101]:
# Cast item_id from text to a numeric dtype. errors="coerce" turns any value that
# cannot be parsed into NaN instead of raising.
df_reviews["item_id"] = pd.to_numeric(df_reviews["item_id"], errors="coerce")

In [102]:
df_reviews.info()

<class 'pandas.core.frame.DataFrame'>
Index: 58400 entries, 0 to 58429
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   user_id      58400 non-null  object        
 1   user_url     58400 non-null  object        
 2   item_id      58400 non-null  int64         
 3   helpful      58400 non-null  object        
 4   recommend    58400 non-null  bool          
 5   review_text  58400 non-null  object        
 6   date         58400 non-null  datetime64[ns]
 7   year         58400 non-null  int32         
dtypes: bool(1), datetime64[ns](1), int32(1), int64(1), object(4)
memory usage: 3.4+ MB


### Store reviews Dataframe 
Now that we have loaded and transformed the data into valuable information, we store it to proceed with the EDA.
We choose to store the data as .parquet because of the size limitations<br>

In [103]:
# Define the file path for storing the Parquet file
reviews = 'data/reviews.parquet'

# Make sure the target folder exists; writing into a missing directory fails
os.makedirs(os.path.dirname(reviews), exist_ok=True)

# Store the DataFrame as a Parquet file, without writing the index
df_reviews.to_parquet(reviews, index=False)

# Print a message confirming the storage location
print(f'reviews DataFrame was stored into {reviews}')

reviews DataFrame was stored into data/reviews.parquet
