# Download Kaggle Dataset using Kaggle API

This Jupyter Notebook demonstrates how to download a dataset from Kaggle using the Kaggle API. Make sure you have your Kaggle API credentials ready before running this code. You can find your Kaggle API credentials by logging into your Kaggle account and going to the "Account" tab of your user profile (https://www.kaggle.com/your-username/account).

Data collection, cleaning, formatting, and storage together form the first stage of the data science workflow:

1. Data collection, cleaning, formatting and storage   <----------------- We are here
2. Data transformation and feature engineering
3. Statistical modeling and machine learning
4. Visualization and presentation

In [59]:
from kaggle.api.kaggle_api_extended import KaggleApi
from dotenv import load_dotenv
import os

# --- Settings ---------------------------------------------------------------
# Kaggle dataset identifier, written as '<owner>/<dataset-slug>'.
dataset_name = 'mariosfish/default-of-credit-card-clients'
# Local folder that will receive the extracted dataset files.
destination_path = '../data/raw_data'

# --- Kaggle client ----------------------------------------------------------
api = KaggleApi()

# Load variables from a local .env file into the process environment.
load_dotenv()

# No credentials are passed in code: authenticate() is called without
# arguments, right after the .env load.
# NOTE(review): presumably it picks up KAGGLE_USERNAME / KAGGLE_KEY from the
# environment (or a kaggle.json file) -- confirm for your setup.
api.authenticate()

# --- Download ---------------------------------------------------------------
# exist_ok=True keeps this cell re-runnable when the folder already exists.
os.makedirs(destination_path, exist_ok=True)

# Fetch the dataset and extract it into destination_path.
api.dataset_download_files(
    dataset_name,
    path=destination_path,
    unzip=True,
)


# Load data from Kaggle into a pandas DataFrame

In [60]:
# Import necessary libraries
import pandas as pd
import numpy as np

# Read the raw CSV from the ../data/raw_data folder into a DataFrame.
raw_csv_path = '../data/raw_data/default of credit card clients.csv'
data = pd.read_csv(raw_csv_path)

# Optional sanity check: print a label followed by the first rows.
print("Dataset Preview:", data.head(), sep="\n")

Dataset Preview:
   ID  LIMIT_BAL  SEX  EDUCATION  MARRIAGE  AGE  PAY_1  PAY_2  PAY_3  PAY_4  \
0   1      20000    2          2         1   24      2      2     -1     -1   
1   2     120000    2          2         2   26     -1      2      0      0   
2   3      90000    2          2         2   34      0      0      0      0   
3   4      50000    2          2         1   37      0      0      0      0   
4   5      50000    1          2         1   57     -1      0     -1      0   

   ...  BILL_AMT4  BILL_AMT5  BILL_AMT6  PAY_AMT1  PAY_AMT2  PAY_AMT3  \
0  ...          0          0          0         0       689         0   
1  ...       3272       3455       3261         0      1000      1000   
2  ...      14331      14948      15549      1518      1500      1000   
3  ...      28314      28959      29547      2000      2019      1200   
4  ...      20940      19146      19131      2000     36681     10000   

   PAY_AMT4  PAY_AMT5  PAY_AMT6  dpnm  
0         0         0        

In [61]:

# Data Cleaning

# Remove duplicate rows (if any)
count_before = len(data)
data.drop_duplicates(inplace=True)
count_after = len(data)
print(f"Number of duplicate rows dropped: {count_before - count_after}")

# Remove rows that are all missing values (if any)
count_before = len(data)
data.dropna(how='all', inplace=True)
count_after = len(data)
print(f"Number of rows with missing all values dropped: {count_before - count_after}")

# Remove columns that are all missing values (if any)
count_before = len(data.columns)
data.dropna(axis=1, how='all', inplace=True)
count_after = len(data.columns)
print(f"Number of columns with missing all values dropped: {count_before - count_after}")

# Decide on a strategy to handle missing values (e.g., imputation, removal, etc.)
do_imputation = True
while do_imputation:
    print(f"Which strategy do you want for the missing values?")
    print(f"1. Imputation")
    print(f"2. Removal")
    print(f"3. Interpolation")
    print(f"4. None")
    choice = input("Enter your choice: ")
    if choice in ['1', '2', '3', '4']:
        choice = int(choice)
        do_imputation = False
    else:
        print("Invalid choice. Please try again.")
switch = {
    1: 'imputation',
    2: 'removal',
    3: 'interpolation',
    4: 'none'
}
print(f"Using {switch[choice]}.")
match  switch:
    case 1:
        # Impute missing values using the mean of the column
        data.fillna(data.mean(), inplace=True)
    case 2:
        # Remove rows with missing values
        data.dropna(inplace=True)
    case 3:
        # Interpolate missing values using the mean of the column
        data.interpolate(inplace=True)
    case 4:
        # Do nothing
        pass


Number of duplicate rows dropped: 0
Number of rows with missing all values dropped: 0
Number of columns with missing all values dropped: 0
Which strategy do you want for the missing values?
1. Imputation
2. Removal
3. Interpolation
4. None
Using none.


In [62]:
# Persist the cleaned DataFrame as a CSV file.

# destination_path is rebound here to the processed-data folder, which is
# created if it is missing (exist_ok=True keeps the cell re-runnable).
destination_path = '../data/processed_data'
os.makedirs(destination_path, exist_ok=True)

# index=False leaves the DataFrame index out of the written file.
processed_csv_path = os.path.join(destination_path, 'processed_data.csv')
data.to_csv(processed_csv_path, index=False)