In [5]:
import sqlite3
import pandas as pd
import os


# Loading an SQLite Database (Table "loans")

In this code block, the data from the CSV file was efficiently loaded into the SQLite database using chunked reading to handle the 2 million records without overloading memory. Each chunk of 50,000 rows is read with pandas.read_csv and converted into an SQL table using to_sql, automatically creating the "loans" table in the database if it didn't already exist and adding the records in each iteration. This way, all the information was stored in a structured way in SQLite, preserving all the columns of the original dataset and allowing subsequent queries without needing to load the entire file into memory.

In [6]:
# Location of the project's SQLite database file.
DB_PATH = "/workspaces/final_project_creditscoring/Data/credit_scoring.db"

# Open the database and keep a cursor handy for ad-hoc statements.
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()

In [None]:


chunksize = 50000  # Rows per chunk; tune to the memory available

csv_path = "/workspaces/final_project_creditscoring/Data/accepted_2007_to_2018Q4.csv"

# Stream the CSV in fixed-size chunks so the whole file never sits in memory.
# The first chunk replaces any existing "loans" table and the following chunks
# append to it, so re-running this cell rebuilds the table instead of
# duplicating every row.
for i, chunk in enumerate(
    pd.read_csv(csv_path, chunksize=chunksize, parse_dates=['issue_d'])
):
    # to_sql creates the table from the chunk's columns when it does not exist.
    chunk.to_sql(
        "loans",
        conn,
        if_exists='replace' if i == 0 else 'append',
        index=False,
    )


  for chunk in pd.read_csv("/workspaces/final_project_creditscoring/Data/accepted_2007_to_2018Q4.csv", chunksize=chunksize, parse_dates=['issue_d']):
  for chunk in pd.read_csv("/workspaces/final_project_creditscoring/Data/accepted_2007_to_2018Q4.csv", chunksize=chunksize, parse_dates=['issue_d']):
  for chunk in pd.read_csv("/workspaces/final_project_creditscoring/Data/accepted_2007_to_2018Q4.csv", chunksize=chunksize, parse_dates=['issue_d']):
  for chunk in pd.read_csv("/workspaces/final_project_creditscoring/Data/accepted_2007_to_2018Q4.csv", chunksize=chunksize, parse_dates=['issue_d']):
  for chunk in pd.read_csv("/workspaces/final_project_creditscoring/Data/accepted_2007_to_2018Q4.csv", chunksize=chunksize, parse_dates=['issue_d']):
  for chunk in pd.read_csv("/workspaces/final_project_creditscoring/Data/accepted_2007_to_2018Q4.csv", chunksize=chunksize, parse_dates=['issue_d']):
  for chunk in pd.read_csv("/workspaces/final_project_creditscoring/Data/accepted_2007_to_2018Q4.csv

The following code block creates a month-balanced sample from the full loans table. First, the records for each month are counted in SQLite to identify the months with at least a minimum number of records (`MIN_REGISTROS_POR_MES`). Then the number of records to take from each month is calculated so that the final dataset has at most about `TOTAL_REGISTROS` rows. Next, a random sample of up to that many records is drawn for each valid month directly from the database; months with too little data are excluded, and a valid month that holds fewer records than the per-month quota contributes all of its records, so the final total can fall somewhat short of the target. Finally, all the fragments are concatenated into a single DataFrame `df_final`, the dates are converted to datetime format, and the shape of the dataset and the number of unique months are verified. This yields a more manageable dataset that represents every month with sufficient data, without loading the original 2 million records into memory.


In [8]:

TOTAL_REGISTROS = 200_000      # Target size of the final sample
MIN_REGISTROS_POR_MES = 1000   # Months with fewer rows than this are excluded

# 1) Count the records per issue month.
monthly_counts = pd.read_sql("""
    SELECT
        strftime('%Y-%m', issue_d) AS year_month,
        COUNT(*) AS n_registros
    FROM loans
    GROUP BY year_month
""", conn)

# 2) Keep the months with enough data. Rows whose issue_d is missing or cannot
#    be parsed by strftime are grouped under a NULL year_month; that group is
#    dropped explicitly so it can never be treated as a month.
valid_months = monthly_counts.loc[
    monthly_counts['year_month'].notna()
    & (monthly_counts['n_registros'] >= MIN_REGISTROS_POR_MES),
    'year_month',
].tolist()

# Fail with a clear message instead of a ZeroDivisionError below.
if not valid_months:
    raise ValueError(
        f"No month in 'loans' has at least {MIN_REGISTROS_POR_MES} records; "
        "cannot build the balanced sample."
    )

# 3) Per-month quota so that the total stays at or below TOTAL_REGISTROS.
rows_per_month = TOTAL_REGISTROS // len(valid_months)

print(f"Meses válidos: {len(valid_months)}")
print(f"Registros por mes: {rows_per_month}")

# 4) Per-month random sampling done inside SQLite. Values are bound as
#    parameters rather than interpolated into the SQL text.
#    NOTE: SQLite's RANDOM() cannot be seeded, so the sample differs between
#    runs. A month with fewer than rows_per_month rows contributes all of them.
query = """
    SELECT *
    FROM loans
    WHERE strftime('%Y-%m', issue_d) = ?
    ORDER BY RANDOM()
    LIMIT ?
"""

df_list = []

for m in valid_months:
    df_month = pd.read_sql_query(query, conn, params=(m, rows_per_month))
    df_list.append(df_month)

# 5) Final dataset: one frame with every monthly sample.
df_final = pd.concat(df_list, ignore_index=True)

# 6) issue_d comes back from SQLite as text; convert it to datetime.
df_final['issue_d'] = pd.to_datetime(df_final['issue_d'])

# 7) Sanity check on the size and the month coverage.
print("Shape final:", df_final.shape)
print("Meses únicos:", df_final['issue_d'].dt.to_period('M').nunique())


Meses válidos: 103
Registros por mes: 1941
Shape final: (192309, 151)
Meses únicos: 103


We check whether any months are missing between the first and last month of the sample, i.e. whether the monthly series is continuous.


In [9]:
# Distinct issue months present in the sample, in chronological order,
# held as a monthly PeriodIndex.
meses = pd.PeriodIndex(
    df_final['issue_d'].dt.to_period('M').sort_values().unique(),
    freq='M',
)

# Every month between the first and the last observed one.
rango_completo = pd.period_range(start=meses.min(), end=meses.max(), freq='M')

# Months of the full range that never appear in the sample.
meses_faltantes = rango_completo.difference(meses)

# Report the outcome.
if len(meses_faltantes) != 0:
    print("⚠️ There are months with no data:")
    print(meses_faltantes)
else:
    print("✅ No missing months. The series is continuous.")


✅ No missing months. The series is continuous.


# Save a table called 'main_table' in our credit_scoring.db file.
Finally, we save the previously created dataset in a table.


In [None]:

# Persist the balanced sample as "main_table", replacing any previous version
# of that table. The connection is closed even if the write fails, so a failed
# run does not leave the database handle open.
try:
    df_final.to_sql("main_table", conn, if_exists="replace", index=False)
finally:
    conn.close()

: 