# Data Type Converter


In [3]:
from typing import Union
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import category_encoders as ce


### Categorical Values


In [4]:
def label_encoding(dataframe: pd.DataFrame, column: Union[str, int]):
    """Return a copy of ``dataframe`` with ``column`` replaced by integer labels.

    The labels are produced by a freshly fitted scikit-learn ``LabelEncoder``.
    The input frame is not modified; all work happens on a copy.

    Args:
        dataframe: Source data.
        column: Label of the categorical column to encode.

    Returns:
        A copy of ``dataframe`` whose ``column`` holds the encoded labels.

    Raises:
        ValueError: If ``column`` is not one of the frame's columns.
    """
    if column in dataframe.columns:
        encoded_frame = dataframe.copy()
        encoded_frame[column] = LabelEncoder().fit_transform(encoded_frame[column])
        return encoded_frame

    raise ValueError(f"Column '{column}' not found in DataFrame.")


In [5]:
def one_hot_encoding(dataframe: pd.DataFrame, column: Union[str, int]):
    """Return a new frame with ``column`` replaced by one-hot indicator columns.

    One indicator column is created per distinct value of ``column`` (named
    ``"<column>_<value>"`` by ``pd.get_dummies``) and appended after the
    remaining columns. The input frame is not modified.

    Args:
        dataframe: Source data.
        column: Label of the categorical column to expand.

    Returns:
        A new DataFrame with the same rows and index as ``dataframe``.

    Raises:
        ValueError: If ``column`` is missing, or if a generated indicator
            column name clashes with an existing column.
    """
    if column not in dataframe.columns:
        raise ValueError(f"Column '{column}' not found in DataFrame.")

    indicators = pd.get_dummies(dataframe[column], prefix=column)
    remaining = dataframe.drop(columns=column)

    # Keep failing loudly on name clashes rather than silently producing
    # duplicate column labels.
    clashes = remaining.columns.intersection(indicators.columns)
    if len(clashes) > 0:
        raise ValueError(
            f"One-hot columns overlap with existing columns: {list(clashes)}"
        )

    # Both pieces share the exact same index, so concat glues them side by
    # side. A label-based join would multiply rows whenever the index
    # contains duplicate labels.
    return pd.concat([remaining, indicators], axis=1)


### Numerical Conversion


In [6]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler, PowerTransformer


In [7]:
def standardize_data(dataframe: pd.DataFrame, column: Union[str, int]):
    """Return a copy of ``dataframe`` with ``column`` standardized.

    The column is rescaled with scikit-learn's ``StandardScaler`` (zero mean,
    unit variance). The input frame is not modified.

    Args:
        dataframe: Source data.
        column: Label of the numeric column to standardize.

    Returns:
        A copy of ``dataframe`` whose ``column`` holds the scaled values.

    Raises:
        ValueError: If ``column`` is not one of the frame's columns.
    """
    if column not in dataframe.columns:
        raise ValueError(f"Column '{column}' not found in DataFrame.")

    df_copy = dataframe.copy()
    # scikit-learn scalers require 2-D input; selecting with [[column]] keeps
    # the data as a single-column frame instead of a 1-D Series.
    scaled = StandardScaler().fit_transform(df_copy[[column]])
    # np.asarray guards against scalers configured to return DataFrames.
    df_copy[column] = np.asarray(scaled)[:, 0]
    return df_copy


In [8]:
def normalize_data(dataframe: pd.DataFrame, column: Union[str, int]):
    """Return a copy of ``dataframe`` with ``column`` min-max normalized.

    The column is rescaled with scikit-learn's ``MinMaxScaler`` using its
    default settings (target range ``[0, 1]``). The input frame is not
    modified.

    Args:
        dataframe: Source data.
        column: Label of the numeric column to normalize.

    Returns:
        A copy of ``dataframe`` whose ``column`` holds the scaled values.

    Raises:
        ValueError: If ``column`` is not one of the frame's columns.
    """
    if column not in dataframe.columns:
        raise ValueError(f"Column '{column}' not found in DataFrame.")

    df_copy = dataframe.copy()
    # scikit-learn scalers require 2-D input; selecting with [[column]] keeps
    # the data as a single-column frame instead of a 1-D Series.
    scaled = MinMaxScaler().fit_transform(df_copy[[column]])
    # np.asarray guards against scalers configured to return DataFrames.
    df_copy[column] = np.asarray(scaled)[:, 0]
    return df_copy


In [9]:
def normalize_vectors(dataframe: pd.DataFrame, column: Union[str, int]):
    """Return a copy of ``dataframe`` with ``column`` scaled to unit L2 norm.

    The whole column is treated as one vector and divided by its Euclidean
    norm. A column whose norm is zero is left as it is (apart from the cast
    to float) to avoid dividing by zero. The input frame is not modified.

    Args:
        dataframe: Source data.
        column: Label of the numeric column to normalize.

    Returns:
        A copy of ``dataframe`` whose ``column`` holds the normalized values
        as floats.

    Raises:
        ValueError: If ``column`` is not one of the frame's columns.
    """
    if column not in dataframe.columns:
        raise ValueError(f"Column '{column}' not found in DataFrame.")

    df_copy = dataframe.copy()
    values = df_copy[column].astype(float)
    # A Series is 1-D, so the norm is taken over the whole column; asking for
    # axis=1 here would raise an AxisError.
    norm = np.linalg.norm(values.to_numpy())
    if norm != 0:
        values = values / norm
    df_copy[column] = values
    return df_copy


### String Conversion


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [12]:
def compute_tfidf(dataframe: pd.DataFrame, column: Union[str, int]) -> pd.DataFrame:
    """Compute TF-IDF features for the text documents stored in ``column``.

    Each row of ``column`` is treated as one document and vectorized with a
    default scikit-learn ``TfidfVectorizer``.

    Args:
        dataframe: Source data.
        column: Label of the text column to vectorize.

    Returns:
        A DataFrame with one row per input row (same index as ``dataframe``)
        and one column per term in the fitted vocabulary.

    Raises:
        ValueError: If ``column`` is not one of the frame's columns.
        TypeError: If ``column`` does not have a string dtype.
    """
    if column not in dataframe.columns:
        raise ValueError(f"Column '{column}' not found in DataFrame.")
    # Explicit raise rather than `assert`: assertions are stripped when Python
    # runs with -O, which would silently disable this validation.
    if not pd.api.types.is_string_dtype(dataframe[column]):
        raise TypeError(f"Column '{column}' is not string type")

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(dataframe[column])
    return pd.DataFrame(
        tfidf_matrix.toarray(),
        columns=vectorizer.get_feature_names_out(),
        index=dataframe.index,
    )


### Test


In [10]:
# Smoke test: load the movies dataset and label-encode its 'Genre' column.
df = pd.read_csv("dataset/movies.csv")
# errors='coerce' turns unparseable dates into NaT instead of raising.
df['Release Date'] = pd.to_datetime(df['Release Date'], errors='coerce')
# The result is not assigned: label_encoding works on a copy, so `df` keeps
# its original 'Genre' values and the encoded frame is only the cell's output.
label_encoding(df, 'Genre')


Unnamed: 0,Title,Director,Genre,Release Date,Duration,Rating
0,Key entire popular.,Anthony Becker,6,1981-05-12,102,6.8
1,Gun husband reveal.,William Johnson,3,2016-06-13,92,7.6
2,Crime cover.,Amy Le,4,1988-03-22,144,5.5
3,Challenge.,Andrea Martinez,7,2013-04-01,161,2.0
4,Close study.,Michael Rodgers,5,2012-10-18,177,3.7
...,...,...,...,...,...,...
29995,Daughter.,Richard Nelson,7,2007-03-12,177,8.0
29996,Simply.,Jeffrey Hatfield,5,2011-08-16,126,5.7
29997,Also authority nor.,Ryan Brown,0,1998-05-07,73,4.9
29998,Total report upon.,Melissa Stephenson,2,2008-06-06,145,6.9
