# Data Type Converter


In [None]:
from typing import Union
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import category_encoders as ce


### Categorical Values


In [None]:
def label_encoding(dataframe: pd.DataFrame, column: Union[str, int]):
    """Return a copy of ``dataframe`` with ``column`` label-encoded.

    A fresh ``LabelEncoder`` is fitted on the column and its output replaces
    the column in the copy; the frame passed in is left untouched.

    Args:
        dataframe: Frame holding the categorical column.
        column: Label of the column to encode.

    Returns:
        A new DataFrame in which ``column`` holds the encoder's output.

    Raises:
        ValueError: If ``column`` is not one of the frame's columns.
    """
    column_exists = column in dataframe.columns
    if not column_exists:
        raise ValueError(f"Column '{column}' not found in DataFrame.")

    encoded_frame = dataframe.copy()
    codes = LabelEncoder().fit_transform(encoded_frame[column])
    encoded_frame[column] = codes
    return encoded_frame


In [None]:
def one_hot_encoding(dataframe: pd.DataFrame, column: Union[str, int]):
    """Return a copy of ``dataframe`` with ``column`` one-hot encoded.

    The column is removed and one indicator column per distinct value is
    appended at the end, named ``<column>_<value>``. The input frame is not
    modified.

    Args:
        dataframe: Frame holding the categorical column.
        column: Label of the column to expand.

    Returns:
        A new DataFrame with the same rows (and index) as ``dataframe``.

    Raises:
        ValueError: If ``column`` is not one of the frame's columns, or if a
            generated indicator column name already exists in the frame.
    """
    if column not in dataframe.columns:
        raise ValueError(f"Column '{column}' not found in DataFrame.")

    one_hot_encoded = pd.get_dummies(dataframe[column], prefix=column)
    remaining = dataframe.drop(column, axis=1)

    # Keep refusing ambiguous results when an indicator name collides with an
    # existing column, instead of silently producing duplicate labels.
    clashes = [name for name in one_hot_encoded.columns if name in remaining.columns]
    if clashes:
        raise ValueError(f"columns overlap but no suffix specified: {clashes}")

    # Both frames carry the exact same index, so a column-wise concat glues
    # them together row for row. A label-based join would multiply rows
    # whenever the index contains duplicate labels.
    return pd.concat([remaining, one_hot_encoded], axis=1)


### Numerical Conversion


In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler, PowerTransformer


In [None]:
def standardize_data(dataframe: pd.DataFrame, column: Union[str, int]):
    """Return a copy of ``dataframe`` with ``column`` standardized.

    The column is passed through a freshly fitted ``StandardScaler``
    (zero mean, unit variance with the scaler's default settings). All other
    columns and the index are carried over unchanged, and the input frame is
    not modified.

    Args:
        dataframe: Frame holding the numeric column.
        column: Label of the column to scale.

    Returns:
        A new DataFrame in which ``column`` holds the scaled values.

    Raises:
        ValueError: If ``column`` is not one of the frame's columns.
    """
    if column not in dataframe.columns:
        raise ValueError(f"Column '{column}' not found in DataFrame.")

    df_copy = dataframe.copy()
    scaler = StandardScaler()
    # scikit-learn scalers expect 2-D input of shape (n_samples, n_features);
    # selecting with a list keeps the single column two-dimensional.
    scaled_data = scaler.fit_transform(df_copy[[column]])
    # Flatten back to 1-D so the values are written positionally into the column.
    df_copy[column] = np.asarray(scaled_data).ravel()
    return df_copy


In [None]:
def normalize_data(dataframe: pd.DataFrame, column: Union[str, int]):
    """Return a copy of ``dataframe`` with ``column`` min-max scaled.

    The column is passed through a freshly fitted ``MinMaxScaler`` (which
    maps the column onto the scaler's default feature range). All other
    columns and the index are carried over unchanged, and the input frame is
    not modified.

    Args:
        dataframe: Frame holding the numeric column.
        column: Label of the column to scale.

    Returns:
        A new DataFrame in which ``column`` holds the scaled values.

    Raises:
        ValueError: If ``column`` is not one of the frame's columns.
    """
    if column not in dataframe.columns:
        raise ValueError(f"Column '{column}' not found in DataFrame.")

    df_copy = dataframe.copy()
    scaler = MinMaxScaler()
    # scikit-learn scalers expect 2-D input of shape (n_samples, n_features);
    # selecting with a list keeps the single column two-dimensional.
    scaled_data = scaler.fit_transform(df_copy[[column]])
    # Flatten back to 1-D so the values are written positionally into the column.
    df_copy[column] = np.asarray(scaled_data).ravel()
    return df_copy


In [None]:
def normalize_vectors(dataframe: pd.DataFrame, column: Union[str, int]):
    """Return a copy of ``dataframe`` with ``column`` scaled to unit L2 norm.

    Two column layouts are supported:

    * scalar cells: the whole column is treated as one vector and divided
      by its Euclidean norm;
    * vector cells (equal-length lists/arrays): every row's vector is divided
      by its own Euclidean norm and stored back as a list.

    Zero vectors are left as zeros rather than dividing by zero. The input
    frame is not modified.

    Args:
        dataframe: Frame holding the numeric (or vector-valued) column.
        column: Label of the column to normalize.

    Returns:
        A new DataFrame in which ``column`` holds the normalized values.

    Raises:
        ValueError: If ``column`` is not one of the frame's columns, or if its
            cells are nested more deeply than 1-D vectors.
    """
    if column not in dataframe.columns:
        raise ValueError(f"Column '{column}' not found in DataFrame.")

    df_copy = dataframe.copy()
    values = np.asarray(df_copy[column].tolist(), dtype=float)

    if values.ndim == 1:
        # Scalar cells: the column as a whole is the vector.
        norm = np.linalg.norm(values)
        df_copy[column] = values if norm == 0 else values / norm
    elif values.ndim == 2:
        # Vector cells: one norm per row, kept 2-D so it broadcasts over rows.
        norms = np.linalg.norm(values, axis=1, keepdims=True)
        norms[norms == 0] = 1.0  # zero vectors stay zero instead of becoming NaN
        df_copy[column] = (values / norms).tolist()
    else:
        raise ValueError(f"Column '{column}' must hold scalars or 1-D vectors.")

    return df_copy


### String Conversion


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [None]:
def compute_tfidf(dataframe: pd.DataFrame, column: Union[str, int]) -> pd.DataFrame:
    """Compute TF-IDF vectors for the text documents stored in ``column``.

    Each row of the column is treated as one document and vectorized with a
    freshly fitted ``TfidfVectorizer`` using its default settings.

    Args:
        dataframe: Frame holding the text column.
        column: Label of the column containing the documents.

    Returns:
        A DataFrame with one row per document (sharing ``dataframe``'s index)
        and one column per term in the fitted vocabulary.

    Raises:
        ValueError: If ``column`` is not one of the frame's columns.
        TypeError: If ``column`` does not have a string dtype.
    """
    if column not in dataframe.columns:
        raise ValueError(f"Column '{column}' not found in DataFrame.")
    # Raise explicitly instead of asserting: assertions are stripped when
    # Python runs with -O, which would let non-text data reach the vectorizer.
    if not pd.api.types.is_string_dtype(dataframe[column]):
        raise TypeError(f"Column '{column}' is not string type")

    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(dataframe[column])
    # fit_transform returns a sparse matrix; densify it so it can back a DataFrame.
    tfidf_df = pd.DataFrame(
        X.toarray(),
        columns=vectorizer.get_feature_names_out(),
        index=dataframe.index,
    )

    return tfidf_df


### Test


In [None]:
# Smoke test: load the movies dataset and run the label encoder on it.
df = pd.read_csv("dataset/movies.csv")
# errors='coerce' turns unparseable release dates into NaT instead of raising.
df['Release Date'] = pd.to_datetime(df['Release Date'], errors='coerce')
# label_encoding returns an encoded copy and the result is not assigned, so
# `df` keeps its original 'Genre' values (presumably the notebook cell just
# displays the returned frame).
label_encoding(df, 'Genre')
