<a href="https://colab.research.google.com/github/Danelly-y-Nicole/Industria-del-K-pop/blob/main/K_pop_Modulo_de_Funciones.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Modulos de las funciones

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')
from datetime import datetime

def extract_data(file_path):
    data = pd.read_csv(file_path)
    return data

def check_missing_ratio(data):
    data_na = (data.isnull().sum() / len(data)) * 100
    data_na = data_na.drop(data_na[data_na == 0].index).sort_values(ascending=False)[:30]
    missing_data = pd.DataFrame({'Missing Ratio' :data_na})
    return missing_data

def handle_duplicates(data):
    duplicate_rows_data = data[data.duplicated()]
    return duplicate_rows_data

def preprocess_dates(data):
    data['Debut'] = data['Debut'].replace('0/01/1900', pd.NA)
    data['Date of Birth'] = pd.to_datetime(data['Date of Birth'], format='%d/%m/%Y')
    data['Debut'] = pd.to_datetime(data['Debut'], format='%d/%m/%Y')

    data['age'] = (datetime.now() - data['Date of Birth']).astype('<m8[Y]')
    data['Debut Age'] = (data['Debut'] - data['Date of Birth']).astype('<m8[Y]')

    data['year'], data['month'], data['day'] = data['Date of Birth'].apply(lambda x:x.year), data['Date of Birth'].apply(lambda x:x.month), data['Date of Birth'].apply(lambda x:x.day)

    return data

def map_gender_to_numeric(data):
    gender_mapping = {'M': 1, 'F': 0}
    data['Gender_numeric'] = data['Gender'].map(gender_mapping)
    return data

def save_preprocessed_data(data, output_file):
    data.to_csv(output_file, index=False)

# Load data
file_path = 'kpopidolsv3.csv'
data = extract_data(file_path)

# Data preprocessing
missing_data = check_missing_ratio(data)
duplicate_rows_data = handle_duplicates(data)
data_preprocessed = preprocess_dates(data)

# Map Gender values to numeric values
data_preprocessed = map_gender_to_numeric(data_preprocessed)

# Save preprocessed data
output_file = 'kpopidols_preprocessed.csv'
save_preprocessed_data(data_preprocessed, output_file)




### Verificando si se creo el archivo

In [7]:
import os

# Ruta y nombre de archivo del CSV preprocesado
output_file = 'kpopidols_preprocessed.csv'

# Verificar si el archivo existe
if os.path.exists(output_file):
    print(f"El archivo '{output_file}' se creó correctamente.")
else:
    print(f"El archivo '{output_file}' no se creó o no se encuentra en la ubicación especificada.")


El archivo 'kpopidols_preprocessed.csv' se creó correctamente.


In [8]:
data = pd.read_csv('kpopidols_preprocessed.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1778 entries, 0 to 1777
Data columns (total 22 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Stage Name      1778 non-null   object 
 1   Full Name       1769 non-null   object 
 2   Korean Name     1768 non-null   object 
 3   K Stage Name    1777 non-null   object 
 4   Date of Birth   1776 non-null   object 
 5   Group           1632 non-null   object 
 6   Debut           1625 non-null   object 
 7   Company         1632 non-null   object 
 8   Country         1778 non-null   object 
 9   Second Country  62 non-null     object 
 10  Height          836 non-null    float64
 11  Weight          566 non-null    float64
 12  Birthplace      834 non-null    object 
 13  Other Group     140 non-null    object 
 14  Former Group    264 non-null    object 
 15  Gender          1778 non-null   object 
 16  age             1776 non-null   float64
 17  Debut Age       1623 non-null   f

## Proceso de ML

### Clasificación de Género de los Idols

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer

# Carga los datos
data = pd.read_csv('kpopidols_preprocessed.csv')

def prepare_data(data):
    selected_columns = ['Height', 'Weight', 'age', 'Debut Age', 'year', 'month', 'day', 'Gender_numeric']
    data_selected = data[selected_columns]
    return data_selected

def impute_missing_values(data):
    imputer = SimpleImputer(strategy='mean')
    data_imputed = imputer.fit_transform(data)
    return data_imputed

def train_logistic_regression(X_train, y_train):
    model = LogisticRegression()
    model.fit(X_train, y_train)
    return model

def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

# Prepara los datos
prepared_data = prepare_data(data)

# Manejo de valores nulos (imputación)
imputed_data = impute_missing_values(prepared_data)

# Divide los datos en características (X) y etiquetas (y)
X = imputed_data[:, :-1]
y = imputed_data[:, -1]

# Divide los datos en conjunto de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Entrena el modelo de regresión logística
classification_model = train_logistic_regression(X_train, y_train)

# Evalúa el modelo
accuracy = evaluate_model(classification_model, X_test, y_test)
print("Accuracy:", accuracy)


Accuracy: 0.7162921348314607


### Predicción de la Estatura de los Idols

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

data = pd.read_csv('kpopidols_preprocessed.csv')

def preprocess_data(data):
    # Elimina columnas no relevantes
    data = data.drop(['Stage Name', 'Full Name', 'Korean Name', 'K Stage Name', 'Gender'], axis=1)

    # Codificación de variables categóricas
    categorical_columns = ['Group', 'Company', 'Country', 'Second Country', 'Birthplace', 'Other Group', 'Former Group']
    encoder = OneHotEncoder(drop='first', sparse=False)
    encoded_columns = pd.DataFrame(encoder.fit_transform(data[categorical_columns]))
    encoded_columns.columns = encoder.get_feature_names_out(categorical_columns)
    data = pd.concat([data.drop(categorical_columns, axis=1), encoded_columns], axis=1)

    # Transformación de fechas
    data['Date of Birth'] = pd.to_datetime(data['Date of Birth'])
    data['Debut'] = pd.to_datetime(data['Debut'])
    reference_date = pd.to_datetime('2023-01-01')
    data['Age_at_Debut'] = (data['Debut'] - data['Date of Birth']).dt.days
    data = data.drop(['Date of Birth', 'Debut'], axis=1)

    # Elimina filas con valores nulos
    data = data.dropna()

    return data

def train_model(X_train, y_train):
    # Entrena el modelo de RandomForestRegressor
    regression_model = RandomForestRegressor(random_state=42)
    regression_model.fit(X_train, y_train)
    return regression_model

def evaluate_model(model, X_test, y_test):
    # Evalúa el modelo
    mse = mean_squared_error(y_test, model.predict(X_test))
    return mse


data = preprocess_data(data)

# División de características (X) y etiquetas (y)
X = data.drop('Height', axis=1)
y = data['Height']

# División de datos en conjunto de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = train_model(X_train, y_train)

mse = evaluate_model(model, X_test, y_test)
print("Mean Squared Error:", mse)


Mean Squared Error: 11.774937864077655
