## Cargando datos

In [16]:
import pandas as pd

data = pd.read_csv('competition_data.csv')
submission = pd.read_csv('submission.csv')

In [17]:
data

Unnamed: 0.1,Unnamed: 0,ts,username,platform,conn_country,user_agent_decrypted,master_metadata_track_name,master_metadata_album_artist_name,master_metadata_album_album_name,spotify_track_uri,reason_start,shuffle,TARGET
0,110163,2018-03-11T05:05:45Z,11145402699,"iOS 11.0 (iPhone8,1)",AR,,Crazy For U,Big Time Rush,24/seven,spotify:track:3jFfr89lnSmb4QBtfG8JBP,clickrow,False,False
1,66026,2023-06-05T10:42:33Z,11145402699,ios,AR,unknown,Nada Personal - Remasterizado 2007,Soda Stereo,Me Verás Volver (Hits & Más),spotify:track:09TTeexnlKewZdjOak2sV2,trackdone,True,False
2,116790,2018-07-01T02:04:51Z,11145402699,"iOS 11.0 (iPhone8,1)",AR,unknown,Good Times,CHIC,Risqué,spotify:track:0G3fbPbE1vGeABDEZF0jeG,trackdone,True,True
3,18431,2019-09-08T04:58:07Z,11145402699,"iOS 12.4 (iPhone8,1)",AR,unknown,Verano del 92,Los Piojos,3er Arco,spotify:track:1NXvuBAq48QrxRFQZVmORQ,trackdone,False,True
4,82941,2017-07-13T18:13:52Z,11145402699,"iOS 11.0 (iPhone8,1)",AR,,Simpatico,Ekko Park,Simpatico,spotify:track:2gYJY0sIx1ErgTIha2nPRg,trackdone,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
100139,78756,2017-04-06T20:31:14Z,11145402699,"iOS 10.2.1 (iPad6,8,1)",AR,,Losing My Religion,R.E.M.,In Time: The Best Of R.E.M. 1988-2003,spotify:track:12axV6NUqaYH3yFUWwArzr,trackdone,True,False
100140,12585,2021-11-03T21:06:07Z,11145402699,"iOS 15.1 (iPhone12,3)",AR,unknown,Cerca De La Revolucion,Charly García,Piano Bar,spotify:track:66grIvFLGrI4tNhggO2DAd,trackdone,True,False
100141,93960,2022-01-14T01:13:42Z,11145402699,"iOS 15.2 (iPhone12,3)",AR,unknown,A Los Jóvenes De Ayer - Remastered 2012,Serú Girán,Bicicleta,spotify:track:6YAl320SfxLO4rVIZTFKqG,trackdone,True,False
100142,74339,2024-05-01T17:51:16Z,11145402699,ios,AR,,heat not hot,Serengeti,heat not hot,spotify:track:0rKLO1hXpmoIthQgJKoczN,trackdone,True,False


## Preprocesamiento

In [29]:
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Copia del dataframe
df_clean = data.copy()

# 1. Eliminar columnas innecesarias
cols_to_drop = [
    'user_agent_decrypted',
    'master_metadata_track_name',
    'master_metadata_album_artist_name',
    'master_metadata_album_album_name',
    'spotify_track_uri'
]
df_clean.drop(columns=cols_to_drop, inplace=True)

# 2. Procesar timestamp: extraer hora, día de la semana, mes
df_clean['ts'] = pd.to_datetime(df_clean['ts'], errors='coerce')
df_clean['hour'] = df_clean['ts'].dt.hour
df_clean['weekday'] = df_clean['ts'].dt.weekday
df_clean['month'] = df_clean['ts'].dt.month
df_clean.drop(columns=['ts'], inplace=True)

# 3. Simplificar 'platform': extraer solo sistema operativo (tipo 'iOS', 'Android', etc.)
df_clean['platform'] = df_clean['platform'].str.extract(r'(^\w+)', expand=False).str.lower()

# 4. Codificar variables categóricas con LabelEncoder
cat_columns = ['platform', 'conn_country', 'reason_start']
label_encoders = {}

for col in cat_columns:
    le = LabelEncoder()
    df_clean[col] = df_clean[col].fillna('missing')
    df_clean[col] = le.fit_transform(df_clean[col])
    label_encoders[col] = le

# 5. Convertir booleanos a enteros
df_clean['shuffle'] = df_clean['shuffle'].astype(int)
df_clean['TARGET'] = df_clean['TARGET'].astype(int)

# Mostrar el dataframe limpio
df_clean.head()

Unnamed: 0.1,Unnamed: 0,username,platform,conn_country,reason_start,shuffle,TARGET,hour,weekday,month
0,110163,11145402699,0,0,2,0,0,5,6,3
1,66026,11145402699,0,0,7,1,0,10,0,6
2,116790,11145402699,0,0,7,1,1,2,6,7
3,18431,11145402699,0,0,7,0,1,4,6,9
4,82941,11145402699,0,0,7,1,0,18,3,7


In [19]:
# Copia del dataframe
submission_clean = submission.copy()

# 1. Eliminar columnas innecesarias
cols_to_drop = [
    'user_agent_decrypted',
    'master_metadata_track_name',
    'master_metadata_album_artist_name',
    'master_metadata_album_album_name',
    'spotify_track_uri'
]
submission_clean.drop(columns=cols_to_drop, inplace=True)

# 2. Procesar timestamp: extraer hora, día de la semana, mes
submission_clean['ts'] = pd.to_datetime(submission_clean['ts'], errors='coerce')
submission_clean['hour'] = submission_clean['ts'].dt.hour
submission_clean['weekday'] = submission_clean['ts'].dt.weekday
submission_clean['month'] = submission_clean['ts'].dt.month
submission_clean.drop(columns=['ts'], inplace=True)

# 3. Simplificar 'platform': extraer solo sistema operativo (tipo 'iOS', 'Android', etc.)
submission_clean['platform'] = submission_clean['platform'].str.extract(r'(^\w+)', expand=False).str.lower()

# 4. Codificar variables categóricas con LabelEncoder
cat_columns = ['platform', 'conn_country', 'reason_start']
label_encoders = {}

for col in cat_columns:
    le = LabelEncoder()
    submission_clean[col] = submission_clean[col].fillna('missing')
    submission_clean[col] = le.fit_transform(submission_clean[col])
    label_encoders[col] = le

# 5. Convertir booleanos a enteros
submission_clean['shuffle'] = submission_clean['shuffle'].astype(int)

# Mostrar el dataframe limpio
submission_clean.head()

Unnamed: 0.1,Unnamed: 0,username,platform,conn_country,reason_start,shuffle,hour,weekday,month
0,32933,11145402699,1,0,7,1,3,2,4
1,32713,11145402699,1,0,3,1,19,4,4
2,90799,11145402699,0,0,3,1,13,2,11
3,30990,11145402699,1,0,3,1,18,6,3
4,110120,11145402699,0,0,2,1,17,5,3


In [20]:
x_data = df_clean.drop(columns=['TARGET'])
y_data = df_clean['TARGET']

In [21]:
x_data

Unnamed: 0.1,Unnamed: 0,username,platform,conn_country,reason_start,shuffle,hour,weekday,month
0,110163,11145402699,0,0,2,0,5,6,3
1,66026,11145402699,0,0,7,1,10,0,6
2,116790,11145402699,0,0,7,1,2,6,7
3,18431,11145402699,0,0,7,0,4,6,9
4,82941,11145402699,0,0,7,1,18,3,7
...,...,...,...,...,...,...,...,...,...
100139,78756,11145402699,0,0,7,1,20,3,4
100140,12585,11145402699,0,0,7,1,21,2,11
100141,93960,11145402699,0,0,7,1,1,4,1
100142,74339,11145402699,0,0,7,1,17,2,5


## Sin Validation

### Regresión logística

In [22]:
from sklearn.linear_model import LogisticRegression

reg_log = LogisticRegression(multi_class = 'ovr', solver = 'liblinear')

In [23]:
reg_log.fit(x_data, y_data)



In [24]:
prediction = reg_log.predict_proba(submission_clean) # Umbral por defecto = 0.5.
proba_1 = prediction[:, 1] # Probabilidad de clase 1
proba_1

array([0.52676147, 0.52676147, 0.52676147, ..., 0.52676147, 0.52676147,
       0.52676147], shape=(25037,))

In [None]:
results = pd.DataFrame({'ID': submission_clean['Unnamed: 0'], 'TARGET': proba_1})
results.to_csv('submission_incial.csv', index=False)