In [50]:
import numpy as np
import pandas as pd
import matplotlib as mp

In [51]:
#1. Carga el dataset y muestra las primeras 10 filas

data = pd.read_csv("1102TA_01_dataset.csv")

data.head(10)

Unnamed: 0.1,Unnamed: 0,id_vendedor,nombre,edad,ventas_mensuales,clientes,satisfaccion,region
0,0,1,Carlos,54.0,6099.41,95.0,1.0,Este
1,1,2,Elena,51.0,6308.44,56.0,3.0,Norte
2,2,3,María,58.0,7046.99,68.0,1.0,Este
3,3,4,Elena,44.0,4961.2,63.0,3.0,Norte
4,4,5,Elena,31.0,5993.78,,,Norte
5,5,6,Luis,,6211.87,47.0,1.0,Sur
6,6,7,María,57.0,4403.77,98.0,4.0,Norte
7,7,8,María,55.0,5465.41,25.0,3.0,Sur
8,8,9,María,52.0,5790.45,80.0,4.0,Oeste
9,9,10,Elena,31.0,3909.01,39.0,,Oeste


In [52]:
#2. Analiza cuántos valores nulos y duplicados contiene

data = pd.read_csv("1102TA_01_dataset.csv")
data = data.iloc[:, 1:] #esto elimina el index que crea conflicto para los duplicados

nulos = data.isnull().sum().sum()
duplicados = data.duplicated().sum()

print("Nulos:", nulos)
print("Duplicados:", duplicados)

Nulos: 103
Duplicados: 15


In [53]:
#3. Elimina los duplicados, conservando solo la primera ocurrencia

data = pd.read_csv("1102TA_01_dataset.csv", index_col=0)
data.reset_index(drop=True, inplace=True)

In [58]:
data.drop_duplicates(inplace=True)

In [59]:
# Número de filas totales
print("Filas totales:", len(data))

# Número de duplicados
print("Duplicados restantes:", data.duplicated().sum())

Filas totales: 500
Duplicados restantes: 0


In [60]:
#4. Imputa los valores nulos de forma adecuada
#Edad: media / Ventas: mediana / Clientes: mediana / Satisfacción: moda

data["edad"].fillna(data["edad"].mean(), inplace=True)

data["ventas_mensuales"].fillna(data["ventas_mensuales"].median(), inplace=True)

data["clientes"].fillna(data["clientes"].median(), inplace=True)

data["satisfaccion"].fillna(data["satisfaccion"].mode()[0], inplace=True)

print(data)

     id_vendedor  nombre  edad  ventas_mensuales  clientes  satisfaccion  \
0              1  Carlos  54.0           6099.41      95.0           1.0   
1              2   Elena  51.0           6308.44      56.0           3.0   
2              3   María  58.0           7046.99      68.0           1.0   
3              4   Elena  44.0           4961.20      63.0           3.0   
4              5   Elena  31.0           5993.78      66.0           4.0   
..           ...     ...   ...               ...       ...           ...   
495          496  Carlos  35.0           3488.10      21.0           3.0   
496          497    Luis  27.0           3444.20      30.0           1.0   
497          498    Luis  27.0           3184.89      69.0           4.0   
498          499   Elena  24.0           3035.93      50.0           3.0   
499          500  Carlos  28.0           6446.83      75.0           1.0   

    region  
0     Este  
1    Norte  
2     Este  
3    Norte  
4    Norte  
..     ..

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data["edad"].fillna(data["edad"].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data["ventas_mensuales"].fillna(data["ventas_mensuales"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermed

In [61]:
#5. Detecta los outliers en la columna ventas_mensuales con el método IQR y elimínalos

# Calcular Q1 y Q3
Q1, Q3 = data["ventas_mensuales"].quantile(0.25), data["ventas_mensuales"].quantile(0.75)
IQR = Q3 - Q1

# Definir límites
lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR

# Filtrar datos dentro de los límites
data = data[(data["ventas_mensuales"] >= lower) & (data["ventas_mensuales"] <= upper)]

data = data[(data["ventas_mensuales"] >= lower) & (data["ventas_mensuales"] <= upper)]

data

Unnamed: 0,id_vendedor,nombre,edad,ventas_mensuales,clientes,satisfaccion,region
0,1,Carlos,54.0,6099.41,95.0,1.0,Este
1,2,Elena,51.0,6308.44,56.0,3.0,Norte
2,3,María,58.0,7046.99,68.0,1.0,Este
3,4,Elena,44.0,4961.20,63.0,3.0,Norte
4,5,Elena,31.0,5993.78,66.0,4.0,Norte
...,...,...,...,...,...,...,...
495,496,Carlos,35.0,3488.10,21.0,3.0,Sur
496,497,Luis,27.0,3444.20,30.0,1.0,Oeste
497,498,Luis,27.0,3184.89,69.0,4.0,Norte
498,499,Elena,24.0,3035.93,50.0,3.0,Sur


In [64]:
#6. Aplica normalización MinMax a las columnas edad, ventas_mensuales y clientes

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

cols = ["edad", "ventas_mensuales", "clientes"]
data[cols] = scaler.fit_transform(data[cols])
data

Unnamed: 0,id_vendedor,nombre,edad,ventas_mensuales,clientes,satisfaccion,region
0,1,Carlos,0.864865,0.652256,0.757576,1.0,Este
1,2,Elena,0.783784,0.684479,0.363636,3.0,Norte
2,3,María,0.972973,0.798329,0.484848,1.0,Este
3,4,Elena,0.594595,0.476798,0.434343,3.0,Norte
4,5,Elena,0.243243,0.635973,0.464646,4.0,Norte
...,...,...,...,...,...,...,...
495,496,Carlos,0.351351,0.249714,0.010101,3.0,Sur
496,497,Luis,0.135135,0.242947,0.101010,1.0,Oeste
497,498,Luis,0.135135,0.202974,0.494949,4.0,Norte
498,499,Elena,0.054054,0.180011,0.303030,3.0,Sur


In [68]:
#7. Muestra un resumen del DataFrame limpio y normalizado

data

Unnamed: 0,id_vendedor,nombre,edad,ventas_mensuales,clientes,satisfaccion,region
0,1,Carlos,0.864865,0.652256,0.757576,1.0,Este
1,2,Elena,0.783784,0.684479,0.363636,3.0,Norte
2,3,María,0.972973,0.798329,0.484848,1.0,Este
3,4,Elena,0.594595,0.476798,0.434343,3.0,Norte
4,5,Elena,0.243243,0.635973,0.464646,4.0,Norte
...,...,...,...,...,...,...,...
495,496,Carlos,0.351351,0.249714,0.010101,3.0,Sur
496,497,Luis,0.135135,0.242947,0.101010,1.0,Oeste
497,498,Luis,0.135135,0.202974,0.494949,4.0,Norte
498,499,Elena,0.054054,0.180011,0.303030,3.0,Sur
