# Evidencia Final: Herramientas Computacionales
## Miembros del equipo:
- Vera Sofía Acevedo Gómez  A01747156
- Yael Michel García López  A01750911

In [1]:
# Importar librerías
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load the raw dataset from "unis.csv" (path is relative to the notebook's working directory)
df = pd.read_csv("unis.csv")

## Exploración Inicial

Hallazgos:
- 2341 registros iniciales
- 12 columnas iniciales
- "No of student" declarada como object (debería ser numérico)
- "International Student" declarada como object (debería ser numérico)
- "OverAll Score" declarada como object (debería ser numérico)
- "Female:Male Ratio" declarada como object (debería separar los valores y declararlos numéricos)
- Universidades sin "Rank"
- Registros con valores nulos

In [62]:
# General overview of the DataFrame: row count, columns, dtypes and non-null counts
df.info()

# 2341 initial records
# 12 initial columns
# NOTE(review): the saved output of this cell shows 1556 entries and 14 columns,
# i.e. it appears to come from a run made after cleaning — re-run on a fresh kernel to confirm the raw figures

<class 'pandas.core.frame.DataFrame'>
Index: 1556 entries, 0 to 1696
Data columns (total 14 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   University Rank                   1556 non-null   int64  
 1   Name of University                1556 non-null   object 
 2   Location                          1556 non-null   object 
 3   No of student                     1556 non-null   int64  
 4   No of student per staff           1556 non-null   float64
 5   International Student Percentage  1555 non-null   float64
 6   OverAll Score                     1556 non-null   object 
 7   Teaching Score                    1556 non-null   float64
 8   Research Score                    1556 non-null   float64
 9   Citations Score                   1556 non-null   float64
 10  Industry Income Score             1556 non-null   float64
 11  International Outlook Score       1556 non-null   float64
 12  Female Stud

In [63]:
df.head(1)

# "No of student" is stored as object (should be numeric)
# "International Student" is stored as object (should be numeric)
# "OverAll Score" is stored as object (should be numeric)
# "Female:Male Ratio" is stored as object (the two values should be split and stored as numeric)
# Duplicated records

Unnamed: 0,University Rank,Name of University,Location,No of student,No of student per staff,International Student Percentage,OverAll Score,Teaching Score,Research Score,Citations Score,Industry Income Score,International Outlook Score,Female Students Percentage,Male Students Percentage
0,1,University of Oxford,United Kingdom,20965,10.6,42.0,96.4,92.3,99.7,99.0,74.9,96.2,48.0,52.0


In [9]:
# Check for duplicates: show the rows flagged by df.duplicated()
df[df.duplicated()]

# Duplicated records

Unnamed: 0,University Rank,Name of University,Location,No of student,No of student per staff,International Student,Female:Male Ratio,OverAll Score,Teaching Score,Research Score,Citations Score,Industry Income Score,International Outlook Score


## Limpieza de datos

In [8]:
# Remove duplicated rows
df = df.drop_duplicates()

In [28]:
# Check for repeated values in "Name of University"
df["Name of University"].describe()

# No duplicates: in the output, `count` and `unique` are equal

count                                    2233
unique                                   2233
top       Xi’an Jiaotong-Liverpool University
freq                                        1
Name: Name of University, dtype: object

In [57]:
# Check for nulls: number of missing values per column
df.isna().sum()

University Rank                0
Name of University             0
Location                       0
No of student                  0
No of student per staff        0
International Student          0
OverAll Score                  0
Teaching Score                 0
Research Score                 0
Citations Score                0
Industry Income Score          0
International Outlook Score    0
Female Students Percentage     0
Male Students Percentage       0
dtype: int64

In [12]:
# Overwrite "University Rank" with consecutive integers 1..len(df), following the current row order
# NOTE(review): presumably relies on the file already being ordered by rank; rows dropped
# by later cells will leave gaps in this numbering — confirm that is intended
df['University Rank'] = range(1, len(df) + 1)

In [37]:
# Drop rows where "Name of University" is null, since it is a key variable
df = df.dropna(subset=['Name of University'])

In [44]:
# Drop rows where "Location" is null, since it is a key variable
df = df.dropna(subset=['Location'])

In [46]:
# Drop rows where "OverAll Score" is null, since it is a key variable
df = df.dropna(subset=['OverAll Score'])

In [49]:
# Split "Female:Male Ratio" into two numeric columns.
# The guard keeps the cell re-runnable: after the first run the source column
# has been dropped, so there is nothing left to split.
if "Female:Male Ratio" in df.columns:
    # 1. Split on the first ":" only (n=1) so the result never has more than
    #    two columns, even if a value contains an extra separator
    ratio_parts = df['Female:Male Ratio'].str.split(':', n=1, expand=True)

    # 2. Convert both parts to numeric; anything unparsable becomes NaN (errors='coerce')
    df['Female Students Percentage'] = pd.to_numeric(ratio_parts[0], errors='coerce')
    df['Male Students Percentage'] = pd.to_numeric(ratio_parts[1], errors='coerce')

    # 3. Drop the original "Female:Male Ratio" column
    df = df.drop(columns=["Female:Male Ratio"])

## Análisis de estadística descriptiva para rellenar nulos

In [None]:
# Box plot of "Female Students Percentage"; titled so the figure stands alone,
# trailing ";" hides the Axes repr in the cell output
df["Female Students Percentage"].plot.box(title="Diagrama de caja: Female Students Percentage");


In [None]:
# Histogram of "Female Students Percentage"; titled so the figure stands alone,
# trailing ";" hides the Axes repr in the cell output
df["Female Students Percentage"].plot.hist(title="Histograma: Female Students Percentage");


In [None]:
# Kernel density estimate of "Female Students Percentage"; titled so the figure
# stands alone, trailing ";" hides the Axes repr in the cell output
df["Female Students Percentage"].plot.kde(title="Densidad (KDE): Female Students Percentage");


In [None]:
# Box plot of "Male Students Percentage"; titled so the figure stands alone,
# trailing ";" hides the Axes repr in the cell output
df["Male Students Percentage"].plot.box(title="Diagrama de caja: Male Students Percentage");

In [None]:
# Histogram of "Male Students Percentage"; titled so the figure stands alone,
# trailing ";" hides the Axes repr in the cell output
df["Male Students Percentage"].plot.hist(title="Histograma: Male Students Percentage");

In [None]:
# Kernel density estimate of "Male Students Percentage"; titled so the figure
# stands alone, trailing ";" hides the Axes repr in the cell output
df["Male Students Percentage"].plot.kde(title="Densidad (KDE): Male Students Percentage");

In [56]:
# There are many outliers and they lie far from the centre, so the median
# (rather than the mean) is used to fill the missing percentages

df = df.fillna({
    column: df[column].median()
    for column in ("Female Students Percentage", "Male Students Percentage")
})

In [None]:
# Check for nulls again: number of missing values per column
df.isna().sum()

## Conversión de tipo de dato (object --> numérico)

In [59]:
# "No of student" (object --> numeric)
# Commas are removed before parsing (presumably thousands separators — confirm against the raw file).
# The dtype guard keeps the cell re-runnable: once the column is numeric it no
# longer has a .str accessor, so a second run would otherwise raise AttributeError.
if not pd.api.types.is_numeric_dtype(df["No of student"]):
    df["No of student"] = pd.to_numeric(
        df["No of student"].str.replace(",", "", regex=False)
    )

In [61]:
# "International Student" (object --> numeric)

# 1. Rename the column. rename() returns a new frame (no inplace mutation) and
#    simply does nothing if the column was already renamed by a previous run.
df = df.rename(columns={'International Student': 'International Student Percentage'})

# 2. Strip the "%" sign and parse as numeric. Skipped when the column is already
#    numeric so the cell can be re-run without an AttributeError on .str.
if not pd.api.types.is_numeric_dtype(df['International Student Percentage']):
    df['International Student Percentage'] = pd.to_numeric(
        df['International Student Percentage'].str.replace('%', '', regex=False)
    )

# NOTE(review): the saved df.info() output lists 1555 non-null values out of 1556
# rows for this column, so at least one value appears to end up missing after the
# conversion and is not imputed anywhere — confirm and decide how to handle it.

In [64]:
# "OverAll Score" (object --> numérico)

# Helper that turns one "OverAll Score" entry into a float
def convert_overall_score(score):
    """Return a single float for one "OverAll Score" entry.

    Parameters
    ----------
    score : str, int, float or None
        A plain number ("96.4"), a range written with an en dash
        ("34.0–39.2"), or a value that is already numeric / missing.

    Returns
    -------
    float
        The number itself, the midpoint of the range, or NaN when the
        input is None or NaN.

    Raises
    ------
    ValueError
        If a string part cannot be parsed as a float, or if the string
        contains more than one en dash.
    """
    # Missing value: keep it missing instead of failing
    if score is None:
        return float("nan")
    # Already numeric (e.g. the cell is re-run after conversion, or the value
    # is NaN): the `in` test below only works on strings, so pass it through
    if not isinstance(score, str):
        return float(score)
    if '–' in score:  # a range such as "34.0–39.2"
        low, high = map(float, score.split('–'))  # split the bounds and parse each one
        return (low + high) / 2  # midpoint of the range
    return float(score)  # single value

# Run every "OverAll Score" entry through convert_overall_score and store the
# result back in the column as a numeric dtype
df['OverAll Score'] = pd.to_numeric(
    df['OverAll Score'].apply(convert_overall_score)
)

## Creación de archivo csv limpio

In [65]:
# Save the cleaned DataFrame to a new CSV file, without writing the index
df.to_csv("unis_clean.csv", index=False)