<h3> Kaggle: Data Cleaning </h3>

In [None]:
# Downloading all needed libraries
!pip3 install numpy
!pip3 install pandas
!pip3 install scipy
!pip3 install seaborn
!pip3 install matplotlib
!pip3 install datetime
!pip3 install preprocessing
!pip3 install fuzzywuzzy

Chapter 1: Handling Missing Values

In [None]:
# Libraries
import numpy as np
import pandas as pd
 
df = pd.read_csv("file_name") # Read csv file

# Dataframe dimension
df.head()                                         # Return first 5 rows of dataframe
df.tail()                                         # Return last 5 rows of dataframe
df.shape                                          # Return dimension of dataframe as (rows, columns); an attribute, not a method
np.prod(df.shape)                                 # Return number of cells in dataframe (np.product was removed in NumPy 2.0)

# Clearing null values
# NOTE: dropna / fillna / bfill return a new dataframe; assign the result to keep it
df.isnull().sum()                                 # Return number of missing data per column
df.isnull().sum().sum()                           # Return total number of missing data
df.dropna(axis = 0)                               # Drop rows containing missing value
df.dropna(axis = 1)                               # Drop columns containing missing value
df.fillna(...)                                    # Fill NaN cells with chosen value
df.bfill(axis = ...)                              # Fill NaN cells with the next valid value after them (replaces deprecated fillna(method = "bfill"))
df.bfill(axis = ...).fillna(0)                    # Fill NaN cells with the next valid value after them, then rest of NaN cells to be 0

Chapter 2: Scaling & Normalization

In [None]:
# Libraries
from scipy import stats
from sklearn.preprocessing import minmax_scale

# Scaling: Change range of dataset
# Normalization: Change distribution of dataset
minmax_scale(df[:...])          # Scales data between 0 and 1 (each column separately)
stats.boxcox(df["col_name"])    # Normalizes one column; input must be 1-D and strictly positive, returns (transformed values, fitted lambda)

Chapter 3: Parsing Dates

In [None]:
# Libraries
import numpy as np
import pandas as pd
import seaborn as sns
import datetime

# General
df["col_name"].dtype                         # Return data type of column: datetime64 is date dtype, 'O' for object dtype
sns.displot(data = df, x = ...)              # Draw histogram of the chosen column (arguments after `data` are keyword-only)

# Parsing dates
# Store the parsed result in a new column so the `.dt` accessor below has something to read
df["parsed_date"] = pd.to_datetime(df["col_name"], format = ...) # Change 'O' dtype --> datetime64 dtype
df["parsed_date"].dt.day                     # Select day of the month

Chapter 4: Character Encodings

In [None]:
# Libraries
import charset_normalizer

# Use the same encoder for decode (encoder could be "ascii", "utf-8", "Windows-1252")
# encode / decode are string methods: on a dataframe column they are reached through the `.str` accessor
type(df) # Check datatype
df["col_name"].str.encode("encoder", errors = "replace")                       # Change string dtype --> byte dtype
df["col_name"].str.encode("encoder", errors = "replace").str.decode("encoder") # Change byte --> string

# Guess the encoding of a file from its first 10000 bytes
with open("...csv", 'rb') as rawdata:
    result = charset_normalizer.detect(rawdata.read(10000))
result # Dictionary with the guessed "encoding", "language" and "confidence"

Chapter 5: Inconsistent Data Entry

In [None]:
# Libraries
import numpy as np
import pandas as pd
import fuzzywuzzy
from fuzzywuzzy import process
import charset_normalizer

# General
df["col_name"].unique()         # Return array of unique values of the chosen column
df.sort_values("col_name")      # Sort rows by the chosen column in increasing order

# String modification (the `.str` accessor exists on a column, not on the whole dataframe)
df["col_name"].str.lower()      # Change string to lowercase
df["col_name"].str.strip()      # Remove leading and trailing whitespaces

# Extract similar terms of criteria word
# The second argument is the collection of candidate strings to score, not a column name
fuzzywuzzy.process.extract("criteria", df["col_name"].unique(), limit = ...,
                           scorer = fuzzywuzzy.fuzz.token_sort_ratio)

# Function to replace rows in the column that match the provided string
def replace_matches_in_column(df, column, string_to_match, min_ratio = 47):
    """Overwrite close fuzzy matches of `string_to_match` in `df[column]`, in place.

    The distinct non-missing values of the column are scored against
    `string_to_match` with fuzzywuzzy's `token_sort_ratio`. Only the 10
    best-scoring values are considered, and every row whose value scores at
    least `min_ratio` is set to `string_to_match`.

    Parameters
    ----------
    df : dataframe that is modified in place.
    column : label of the column to clean.
    string_to_match : the canonical spelling to write into matching rows.
    min_ratio : minimum score a value needs to be replaced.

    Returns
    -------
    None
    """
    # Missing values are not strings and cannot be scored, so leave them out
    candidates = df[column].dropna().unique()
    scored = fuzzywuzzy.process.extract(string_to_match, candidates, limit = 10,
                                        scorer = fuzzywuzzy.fuzz.token_sort_ratio)
    # Each entry is (candidate, score, ...); keep the candidates that score high enough
    close_matches = [entry[0] for entry in scored if entry[1] >= min_ratio]
    rows_with_matches = df[column].isin(close_matches)
    df.loc[rows_with_matches, column] = string_to_match