# Project: Recommendation Systems - Your Future with Data


# Step 1: Import Essential Libraries 

In [1]:
# Libraries specialized "expert kits" to plug into Python essential for EDA.
import joblib
import requests
import os
import json
import pickle # -- Binary (unreadable by humans) -- Can save almost any Python object -- Very fast for complex objects --
import math
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from numpy._core.defchararray import upper
from pathlib import Path
from tabulate import tabulate
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.feature_selection import f_classif, SelectKBest, f_regression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, r2_score, mean_absolute_error, mean_squared_error, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.neighbors import NearestNeighbors
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud
from sklearn.svm import SVC

# Step 2: Problem Statement / Data Collection
## 2.1 Description of the Problem
**Objectives**
- Explore census data.
- Build socioeconomic profiles.
- Analyze the importance and weight of social variables (education, gender, race, etc.) in economic predictions.
- Apply recommendation system techniques.
- Visualize and professionally communicate findings.

## 2.2 Initial Loading and Inspection

In [2]:
def setup_project_structure(base_path, subfolders):
    """Create the project's data directory tree.

    Args:
        base_path: Root folder under which the sub-folders are created.
        subfolders: Iterable of folder names (relative to ``base_path``).

    Missing parent folders are created as needed and folders that already
    exist are left untouched, so the call can be repeated safely.
    """
    # Lazily build each target path so folders are created one at a time, in order.
    for target in (Path(base_path) / name for name in subfolders):
        target.mkdir(parents=True, exist_ok=True)
    print(f"Project structure initialized in: {base_path}")

def download_dataset_to_raw(url, full_path, timeout=30):
    """Stream a remote file to disk and report whether it succeeded.

    Args:
        url: Address of the file to download.
        full_path: Destination path the response body is written to.
        timeout: Seconds to wait on the server before giving up, so an
            unresponsive host cannot block the notebook indefinitely.

    Returns:
        True when the file was written completely, False on any failure.
        Failures are printed rather than raised.
    """
    try:
        # The context manager releases the streamed connection, even on errors.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()

            with open(full_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        file.write(chunk)

        print(f"Download successful! Saved to: {full_path}")
        return True
    # Network/HTTP errors are handled before the generic fallback; listed after
    # `except Exception`, this branch could never run.
    except requests.exceptions.RequestException as e:
        print(f"An error occurred during the download: {e}")
        return False
    # Anything else (e.g. the destination cannot be written).
    except Exception as e:
        print(f"Download failed: {e}")
        return False
    
# --- CONFIGURATION ---
# Root data folder, given relative to the working directory.
BASE_DIR = "../data"
# Sub-folders created under BASE_DIR.
FOLDERS = ["raw", "processed", "interim"]
# Remote location of the census CSV.
DATA_URL = "https://breathecode.herokuapp.com/asset/internal-link?id=2325&path=adult-census-income.csv"
# File name the download is stored under inside the 'raw' folder.
TARGET_FILE = "adult-census-income.csv"

# --- EXECUTION ---
# Create the folder tree first so the 'raw' destination exists before downloading.
setup_project_structure(BASE_DIR, FOLDERS)

# Download the dataset into the 'raw' folder. As the last expression of the
# cell, the returned success flag is shown as the cell output.
RAW_FILE_PATH = Path(BASE_DIR) / "raw" / TARGET_FILE
download_dataset_to_raw(DATA_URL, RAW_FILE_PATH)

Project structure initialized in: ../data
Download successful! Saved to: ../data/raw/adult-census-income.csv


True

In [3]:
# Load the raw CSV into a DataFrame. sep=None with the python engine lets
# pandas detect the delimiter instead of assuming a comma.
try:
    df = pd.read_csv(RAW_FILE_PATH, sep= None, engine= 'python')
except FileNotFoundError:
    print(f"\nError: File not found at {RAW_FILE_PATH}")
    # Stop here: every later cell needs `df`, and continuing without it would
    # only surface as a confusing NameError further down.
    raise
else:
    print("\nDataset loaded successfully into a DataFrame:")


Dataset loaded successfully into a DataFrame:


# Step 3: Exploration and Data Cleaning

## 3.1 Dataset Dimensions & Typology Visibility

In [4]:
# df.shape is a (rows, columns) pair; unpack it for the summary message.
row, col = df.shape
print(f"The Dataset has {row} number of rows and {col} columns.")

The Dataset has 32561 number of rows and 15 columns.


## 3.2 Top 5 Row View

In [5]:
# Preview the first rows of the dataset as a psql-style text table.
# NOTE(review): the printed message says "columns", but head() returns rows.
top_five_data = df.head()
print("These are the DataSet top 5 columns view to be analized:")
print(tabulate(top_five_data, headers='keys', tablefmt='psql'))

These are the DataSet top 5 columns view to be analized:
+----+-------+-------------+----------+--------------+-----------------+------------------+-------------------+----------------+--------+--------+----------------+----------------+------------------+------------------+----------+
|    |   age | workclass   |   fnlwgt | education    |   education.num | marital.status   | occupation        | relationship   | race   | sex    |   capital.gain |   capital.loss |   hours.per.week | native.country   | income   |
|----+-------+-------------+----------+--------------+-----------------+------------------+-------------------+----------------+--------+--------+----------------+----------------+------------------+------------------+----------|
|  0 |    90 | ?           |    77053 | HS-grad      |               9 | Widowed          | ?                 | Not-in-family  | White  | Female |              0 |           4356 |               40 | United-States    | <=50K    |
|  1 |    82 | Private 

In [6]:
# Normalise column names: hyphens become underscores and names are lower-cased.
# The names in this dataset are already lower-case and use dots
# (e.g. 'education.num'), so the printed header comes out unchanged.
df.columns = [c.replace('-', '_').lower() for c in df.columns]
print(tabulate(df.head(), headers= "keys", tablefmt= "psql"))

+----+-------+-------------+----------+--------------+-----------------+------------------+-------------------+----------------+--------+--------+----------------+----------------+------------------+------------------+----------+
|    |   age | workclass   |   fnlwgt | education    |   education.num | marital.status   | occupation        | relationship   | race   | sex    |   capital.gain |   capital.loss |   hours.per.week | native.country   | income   |
|----+-------+-------------+----------+--------------+-----------------+------------------+-------------------+----------------+--------+--------+----------------+----------------+------------------+------------------+----------|
|  0 |    90 | ?           |    77053 | HS-grad      |               9 | Widowed          | ?                 | Not-in-family  | White  | Female |              0 |           4356 |               40 | United-States    | <=50K    |
|  1 |    82 | Private     |   132870 | HS-grad      |               9 | Widowed

## 3.3 Data Types and Non-Null Values Overview

In [7]:
print("This is the information about Non-Null and Dtype:\n================================================")
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() appends a stray "None" line to the output.
df.info()

This is the information about Non-Null and Dtype:
<class 'pandas.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   age             32561 non-null  int64
 1   workclass       32561 non-null  str  
 2   fnlwgt          32561 non-null  int64
 3   education       32561 non-null  str  
 4   education.num   32561 non-null  int64
 5   marital.status  32561 non-null  str  
 6   occupation      32561 non-null  str  
 7   relationship    32561 non-null  str  
 8   race            32561 non-null  str  
 9   sex             32561 non-null  str  
 10  capital.gain    32561 non-null  int64
 11  capital.loss    32561 non-null  int64
 12  hours.per.week  32561 non-null  int64
 13  native.country  32561 non-null  str  
 14  income          32561 non-null  str  
dtypes: int64(6), str(9)
memory usage: 3.7 MB
None


In [8]:
# This dataset marks missing entries with '?'. Turn those markers into real
# NaN values first, then discard every row that has at least one missing field.
df = df.replace('?', np.nan)
df = df.dropna()
print(tabulate(df.head(), headers="keys", tablefmt="psql"))

+----+-------+-------------+----------+--------------+-----------------+------------------+-------------------+----------------+--------+--------+----------------+----------------+------------------+------------------+----------+
|    |   age | workclass   |   fnlwgt | education    |   education.num | marital.status   | occupation        | relationship   | race   | sex    |   capital.gain |   capital.loss |   hours.per.week | native.country   | income   |
|----+-------+-------------+----------+--------------+-----------------+------------------+-------------------+----------------+--------+--------+----------------+----------------+------------------+------------------+----------|
|  1 |    82 | Private     |   132870 | HS-grad      |               9 | Widowed          | Exec-managerial   | Not-in-family  | White  | Female |              0 |           4356 |               18 | United-States    | <=50K    |
|  3 |    54 | Private     |   140359 | 7th-8th      |               4 | Divorce

In [9]:
# Youngest age present in the cleaned data (quick range check).
df['age'].min()

np.int64(17)

## 3.4 Check Unique

In [10]:
# Number of distinct values per column.
print("These are the Unique Values for each columns on the Dataset:\n============================================================")
print(df.nunique())

These are the Unique Values for each columns on the Dataset:
age                  72
workclass             7
fnlwgt            20263
education            16
education.num        16
marital.status        7
occupation           14
relationship          6
race                  5
sex                   2
capital.gain        118
capital.loss         90
hours.per.week       94
native.country       41
income                2
dtype: int64


## 3.5 Check for Duplicate Values

In [11]:
# The dataset has no identifier column, so a row counts as a duplicate only
# when every field matches an earlier row. duplicated() flags the repeats but
# not the first occurrence, so the sum is the number of rows a de-duplication
# would remove.
duplicate_val = df.duplicated().sum()
print(f"<< {duplicate_val} >> duplicated value in the dataset.")

<< 23 >> duplicated value in the dataset.


## 3.6 Duplicates Inspection

In [12]:
# keep=False flags every member of a duplicated group (first occurrence
# included), so matching rows can be inspected together.
duplicate_rows = df[df.duplicated(keep= False)]
print("Table with duplicates rows (in case they are and only for visibility):\n======================================================")
# Only the first 24 flagged rows are printed.
print(tabulate(duplicate_rows.head(24), headers= "keys", tablefmt= "psql"))

Table with duplicates rows (in case they are and only for visibility):
+-------+-------+------------------+----------+--------------+-----------------+--------------------+-------------------+----------------+--------------------+--------+----------------+----------------+------------------+------------------+----------+
|       |   age | workclass        |   fnlwgt | education    |   education.num | marital.status     | occupation        | relationship   | race               | sex    |   capital.gain |   capital.loss |   hours.per.week | native.country   | income   |
|-------+-------+------------------+----------+--------------+-----------------+--------------------+-------------------+----------------+--------------------+--------+----------------+----------------+------------------+------------------+----------|
|  6227 |    90 | Private          |    52386 | Some-college |              10 | Never-married      | Other-service     | Not-in-family  | Asian-Pac-Islander | Male   |     

## 3.7 Drop Duplicates

In [13]:
# Row count before de-duplication
original_count = df.shape[0]

# Keep the first copy of each repeated row and discard the others
df = df.drop_duplicates()

# Row count afterwards; the difference is the number of rows discarded
new_count = df.shape[0]
print(f"Removed {original_count - new_count} duplicate rows.")
print(f"Unique data remaining: {new_count}")

Removed 23 duplicate rows.
Unique data remaining: 30139


# Step 4. Defining the Recommendation Problem
- To answer the instruction's requirement, here is how I structure the logic:
    - **The User Profile**: A vector of [age, education.num, marital.status, sex, race, hours.per.week].
    - **The Recommendation Target**: I want to find the Occupation and Workclass of people who look like the user but make Income == 1.
    - **The Math**: I use Cosine Similarity. If User A is a 25-year-old high school grad, I find the 5 closest people in the dataset who earn >50K. I look at their jobs and say, "People like you who earn more usually work in [X] occupation."
    - **Categorical Columns**: These are the text columns that need encoding (become number data for the recommend algorithm)

In [14]:
# 1. THE LIST: every text column in the dataset that needs encoding
categorical_cols = ['workclass', 'education', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country']

# (The encoding itself is done in the cells of sections 4.1 and 4.2.)

# 2. THE PROFILE: the "search criteria" for the recommender, a mix of
# numerical columns and categorical columns that get encoded later.
# 'race' is listed so the profile agrees with the traits that are one-hot
# encoded into the user vector (sex, marital.status, race).
user_profile_features = ['age', 'education.num', 'hours.per.week', 'marital.status', 'sex', 'race']

# 3. THE TARGET: what the search should return.
# These are kept out of the profile because they are what we are looking for.
recommendation_targets = ['occupation', 'workclass']

## 4.1 Transform the Categorical Variables

In [15]:
# ENCODE THE TARGET: map each occupation name to an integer label, stored in
# the new 'occ_label' column. Only 'occupation' is encoded here.
# The fitted encoder stays available as le_occ, so labels can be mapped back
# to occupation names with le_occ.inverse_transform.
le_occ = LabelEncoder()
df['occ_label'] = le_occ.fit_transform(df['occupation'])
print(tabulate(df.head(), headers= "keys", tablefmt= "psql"))

+----+-------+-------------+----------+--------------+-----------------+------------------+-------------------+----------------+--------+--------+----------------+----------------+------------------+------------------+----------+-------------+
|    |   age | workclass   |   fnlwgt | education    |   education.num | marital.status   | occupation        | relationship   | race   | sex    |   capital.gain |   capital.loss |   hours.per.week | native.country   | income   |   occ_label |
|----+-------+-------------+----------+--------------+-----------------+------------------+-------------------+----------------+--------+--------+----------------+----------------+------------------+------------------+----------+-------------|
|  1 |    82 | Private     |   132870 | HS-grad      |               9 | Widowed          | Exec-managerial   | Not-in-family  | White  | Female |              0 |           4356 |               18 | United-States    | <=50K    |           3 |
|  3 |    54 | Private  

## 4.2 Vectorize User Profile

In [16]:
# VECTORIZE USER PROFILE (the traits used for matching)
# One-hot encode the categorical traits: every category gets its own column.
profile_categorical = ['sex', 'marital.status', 'race']
df_vectorized = pd.get_dummies(df, columns=profile_categorical)

# Binarize the target (income):
# labels containing '>' (i.e. >50K) become 1, everything else becomes 0.
df_vectorized['income'] = df_vectorized['income'].apply(lambda label: int('>' in str(label)))
print(tabulate(df_vectorized.head(), headers="keys", tablefmt="psql"))

+----+-------+-------------+----------+--------------+-----------------+-------------------+----------------+----------------+----------------+------------------+------------------+----------+-------------+--------------+------------+---------------------------+------------------------------------+-------------------------------------+----------------------------------------+--------------------------------+----------------------------+--------------------------+---------------------------+---------------------------+--------------+--------------+--------------+
|    |   age | workclass   |   fnlwgt | education    |   education.num | occupation        | relationship   |   capital.gain |   capital.loss |   hours.per.week | native.country   |   income |   occ_label | sex_Female   | sex_Male   | marital.status_Divorced   | marital.status_Married-AF-spouse   | marital.status_Married-civ-spouse   | marital.status_Married-spouse-absent   | marital.status_Never-married   | marital.status_Sepa

### 4.2.1 Why I used pd.get_dummies?
* One-Hot Encoding (The Vector Logic)
    - By using pd.get_dummies, I give each category its own "dimension" in space. For example:
        - Never-married = [1, 0, 0]
        - Married = [0, 1, 0]
        - Divorced = [0, 0, 1]
    - The "distance" between any two different statuses is now exactly the same. This prevents the model from being biased by the arbitrary numbers assigned during factorization.

## 4.3 Normalize Numerical Data

In [17]:
# Standardise the numerical profile features with StandardScaler.
# The scaler is fitted on the untouched columns of `df` (same rows, same order
# as `df_vectorized`) rather than on the columns this cell overwrites.
# Otherwise re-running the cell would re-fit the scaler on already-scaled
# values, and the recommender function uses this same `scaler` to transform
# raw user input.
scaler = StandardScaler()
num_features = ['age', 'education.num', 'hours.per.week']
df_vectorized[num_features] = scaler.fit_transform(df[num_features])
print(tabulate(df_vectorized.head(), headers= "keys", tablefmt= "psql"))

+----+------------+-------------+----------+--------------+-----------------+-------------------+----------------+----------------+----------------+------------------+------------------+----------+-------------+--------------+------------+---------------------------+------------------------------------+-------------------------------------+----------------------------------------+--------------------------------+----------------------------+--------------------------+---------------------------+---------------------------+--------------+--------------+--------------+
|    |        age | workclass   |   fnlwgt | education    |   education.num | occupation        | relationship   |   capital.gain |   capital.loss |   hours.per.week | native.country   |   income |   occ_label | sex_Female   | sex_Male   | marital.status_Divorced   | marital.status_Married-AF-spouse   | marital.status_Married-civ-spouse   | marital.status_Married-spouse-absent   | marital.status_Never-married   | marital.s

## 4.4 Define the User Profile Vector

In [18]:
# Columns that define the "user": the scaled numerical features plus the
# one-hot columns generated for each profile trait.
# get_dummies names those columns "<trait>_<category>", so they are matched by
# that exact prefix; a plain substring test would also select any unrelated
# column whose name merely contains a trait name.
user_vector_cols = [
    col for col in df_vectorized.columns
    if col in num_features
    or col.startswith(tuple(f"{trait}_" for trait in profile_categorical))
]

In [19]:
# Display the final list of columns that make up the user vector.
user_vector_cols

['age',
 'education.num',
 'hours.per.week',
 'sex_Female',
 'sex_Male',
 'marital.status_Divorced',
 'marital.status_Married-AF-spouse',
 'marital.status_Married-civ-spouse',
 'marital.status_Married-spouse-absent',
 'marital.status_Never-married',
 'marital.status_Separated',
 'marital.status_Widowed',
 'race_Amer-Indian-Eskimo',
 'race_Asian-Pac-Islander',
 'race_Black',
 'race_Other',
 'race_White']

## 4.5 Recommendation System (The "User" vs "Trajectory" Matrix)
- I will implement the NearestNeighbors algorithm using a Cosine Similarity metric to identify the top 5 'Success Stories' within the dataset. 
- The system functions as a career trajectory recommender, mapping a user's current demographic profile to the occupations of those who have reached the high-income threshold.

In [20]:
# Build the neighbour index over the user-profile columns. Queries return the
# 5 rows with the smallest cosine distance to the query vector.
# NOTE(review): the index is fitted on every row, not only on >50K earners, so
# neighbours can come from either income group.
recommender = NearestNeighbors(n_neighbors= 5, metric= 'cosine')
recommender.fit(df_vectorized[user_vector_cols])

0,1,2
,"n_neighbors  n_neighbors: int, default=5 Number of neighbors to use by default for :meth:`kneighbors` queries.",5
,"radius  radius: float, default=1.0 Range of parameter space to use by default for :meth:`radius_neighbors` queries.",1.0
,"algorithm  algorithm: {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto' Algorithm used to compute the nearest neighbors: - 'ball_tree' will use :class:`BallTree` - 'kd_tree' will use :class:`KDTree` - 'brute' will use a brute-force search. - 'auto' will attempt to decide the most appropriate algorithm  based on the values passed to :meth:`fit` method. Note: fitting on sparse input will override the setting of this parameter, using brute force.",'auto'
,"leaf_size  leaf_size: int, default=30 Leaf size passed to BallTree or KDTree. This can affect the speed of the construction and query, as well as the memory required to store the tree. The optimal value depends on the nature of the problem.",30
,"metric  metric: str or callable, default='minkowski' Metric to use for distance computation. Default is ""minkowski"", which results in the standard Euclidean distance when p = 2. See the documentation of `scipy.spatial.distance `_ and the metrics listed in :class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric values. If metric is ""precomputed"", X is assumed to be a distance matrix and must be square during fit. X may be a :term:`sparse graph`, in which case only ""nonzero"" elements may be considered neighbors. If metric is a callable function, it takes two arrays representing 1D vectors as inputs and must return one value indicating the distance between those vectors. This works for Scipy's metrics, but is less efficient than passing the metric name as a string.",'cosine'
,"p  p: float (positive), default=2 Parameter for the Minkowski metric from sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is equivalent to using manhattan_distance (l1), and euclidean_distance (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.",2
,"metric_params  metric_params: dict, default=None Additional keyword arguments for the metric function.",
,"n_jobs  n_jobs: int, default=None The number of parallel jobs to run for neighbors search. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details.",


## 4.6 Defining the Recommender Function

In [21]:
def get_career_recommendations(user_input_dict, recommender_model, original_df, vectorized_df, user_cols, scaler_model=None):
    """Return the dataset rows most similar to a user's profile.

    Args:
        user_input_dict: Raw (unscaled) profile, e.g. {'age': 25, 'sex': 'Male'}.
            The numerical keys are 'age', 'education.num' and 'hours.per.week'.
            Any other key/value pair is matched against a one-hot column named
            "<key>_<value>" and is ignored when no such column exists.
        recommender_model: Fitted neighbour model exposing ``kneighbors``.
        original_df: Un-vectorized frame in the same row order as the data the
            model was fitted on; neighbours are read from it by position.
        vectorized_df: Unused; kept so existing calls keep working.
        user_cols: Column names, in order, of the matrix the model was fitted on.
        scaler_model: Fitted scaler for the numerical columns. Defaults to the
            notebook-level ``scaler``.

    Returns:
        DataFrame with one row per neighbour returned by the model, holding
        occupation, education, workclass, income and the distance to the user.
    """
    numeric_cols = ['age', 'education.num', 'hours.per.week']
    if scaler_model is None:
        scaler_model = scaler  # the scaler fitted earlier in the notebook

    # 1. Start from an all-zero float row with the exact columns of the fitted
    #    matrix, so the scaled values fit without a dtype change.
    user_vector = pd.DataFrame(0.0, index=[0], columns=list(user_cols))

    # 2. Fill in the numerical values the user supplied
    for col in numeric_cols:
        if col in user_input_dict:
            user_vector[col] = float(user_input_dict[col])

    # 3. Fill categoricals by switching on the matching one-hot column
    #    e.g. sex='Male' sets 'sex_Male' = 1
    for col, value in user_input_dict.items():
        column_name = f"{col}_{value}"
        if column_name in user_cols:
            user_vector[column_name] = 1.0

    # 4. Scale the numerical columns with the scaler used for the dataset
    user_vector[numeric_cols] = scaler_model.transform(user_vector[numeric_cols])

    # A numerical value the user did not supply must not be read as a raw 0
    # (e.g. age 0). It is set to 0.0 in scaled space instead, which is the
    # training mean when the scaler is a StandardScaler.
    for col in numeric_cols:
        if col not in user_input_dict:
            user_vector[col] = 0.0

    # 5. Find the neighbours
    distances, indices = recommender_model.kneighbors(user_vector)

    # 6. Retrieve the matching rows (by position) from the full dataset
    recommendations = original_df.iloc[indices[0]].copy()
    recommendations['similarity_distance'] = distances[0]

    return recommendations[['occupation', 'education', 'workclass', 'income', 'similarity_distance']]

# Step 5. Testing the Recommender

In [22]:
# Example: a 25-year-old high-school graduate working 40 hours a week.
# Values are given raw; the function scales the numerical ones itself.
new_student = {
    'age': 25,
    'education.num': 9, # HS-grad
    'marital.status': 'Never-married',
    'sex': 'Male',
    'race': 'White',
    'hours.per.week': 40
}

# Look up the nearest profiles and print them as a table.
results = get_career_recommendations(new_student, recommender, df, df_vectorized, user_vector_cols)
print("--- People Most Similar To You ---")
print(tabulate(results, headers= "keys", tablefmt= "psql"))

--- People Most Similar To You ---
+-------+-------------------+-------------+------------------+----------+-----------------------+
|       | occupation        | education   | workclass        | income   |   similarity_distance |
|-------+-------------------+-------------+------------------+----------+-----------------------|
| 13658 | Handlers-cleaners | HS-grad     | Private          | <=50K    |           1.11022e-16 |
| 25220 | Machine-op-inspct | HS-grad     | Private          | <=50K    |           1.11022e-16 |
|  9644 | Prof-specialty    | HS-grad     | Self-emp-not-inc | <=50K    |           1.11022e-16 |
| 31858 | Machine-op-inspct | HS-grad     | Private          | <=50K    |           1.11022e-16 |
| 18945 | Prof-specialty    | HS-grad     | Private          | <=50K    |           1.11022e-16 |
+-------+-------------------+-------------+------------------+----------+-----------------------+


In [24]:
# Second test: a 30-year-old married male with a Bachelors degree.
# NOTE(review): 'native.country' is not one of the user-vector columns, so the
# function ignores it; no 'race' is given, so every race_* entry stays 0.
user_input = {
    'age': 30,
    'education.num': 13, # Bachelors
    'hours.per.week': 40,
    'marital.status': 'Married-civ-spouse',
    'sex': 'Male',
    'native.country': 'United-States'
}

# Look up the nearest profiles for this user.
my_recommendations = get_career_recommendations(
    user_input, 
    recommender, 
    df, 
    df_vectorized, 
    user_vector_cols
)

print("--- People Most Similar To You ---")
print(tabulate(my_recommendations, headers= "keys", tablefmt= "psql"))

--- People Most Similar To You ---
+-------+-------------------+-------------+-------------+----------+-----------------------+
|       | occupation        | education   | workclass   | income   |   similarity_distance |
|-------+-------------------+-------------+-------------+----------+-----------------------|
|  3105 | Prof-specialty    | Masters     | Private     | <=50K    |             0.0959693 |
|   824 | Prof-specialty    | Masters     | Private     | >50K     |             0.0959693 |
| 22317 | Prof-specialty    | Masters     | Private     | >50K     |             0.09606   |
| 18014 | Machine-op-inspct | Masters     | Private     | <=50K    |             0.0965935 |
| 15895 | Exec-managerial   | Masters     | Private     | <=50K    |             0.0965935 |
+-------+-------------------+-------------+-------------+----------+-----------------------+


# Step 6. Final Conclusion
1. **First table**:
- **The first test**: the model found near-identical twins, as the `similarity_distance` column shows values that are practically zero.
- They all earn <=50K.
- Most work in the Private sector.

2. **Second Table**:
- **The opposite**: the second table shows something interesting: the model didn't find "Identical Twins." It found people who are very similar but not exactly the same.
- Income: two people earn >50K and three earn <=50K, so there is income diversity here.