In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


In [2]:
# Load the dataset 
file_path = ("Predict Hair Fall.csv")
df = pd.read_csv(file_path)

print(df.head())

       Id Genetics Hormonal Changes Medical Conditions  \
0  133992      Yes               No            No Data   
1  148393       No               No             Eczema   
2  155074       No               No         Dermatosis   
3  118261      Yes              Yes           Ringworm   
4  111915       No               No          Psoriasis   

  Medications & Treatments Nutritional Deficiencies     Stress  Age  \
0                  No Data      Magnesium deficiency  Moderate   19   
1              Antibiotics      Magnesium deficiency      High   43   
2         Antifungal Cream        Protein deficiency  Moderate   26   
3              Antibiotics        Biotin Deficiency   Moderate   46   
4                 Accutane           Iron deficiency  Moderate   30   

  Poor Hair Care Habits  Environmental Factors Smoking Weight Loss   Hair Loss  
0                    Yes                   Yes      No           No          0  
1                    Yes                   Yes      No        

In [3]:
# Standardize column names
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_").str.replace("&", "and")

print("\nUpdated Column Names:", df.columns.tolist())



Updated Column Names: ['id', 'genetics', 'hormonal_changes', 'medical_conditions', 'medications_and_treatments', 'nutritional_deficiencies', 'stress', 'age', 'poor_hair_care_habits', 'environmental_factors', 'smoking', 'weight_loss', 'hair_loss']


In [4]:
from sklearn.preprocessing import LabelEncoder

# List of categorical columns that need encoding
categorical_columns = ["genetics", "hormonal_changes", "stress", "poor_hair_care_habits",
                       "environmental_factors", "smoking", "weight_loss",
                       "medical_conditions", "medications_and_treatments", "nutritional_deficiencies"]

# Apply Label Encoding
label_encoders = {}
for col in categorical_columns:
    if col in df.columns:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))  # Convert to string 
        label_encoders[col] = le
        print(f" Encoded '{col}' successfully!")
    else:
        print(f" Warning: Column '{col}' not found in dataset!")


 Encoded 'genetics' successfully!
 Encoded 'hormonal_changes' successfully!
 Encoded 'stress' successfully!
 Encoded 'poor_hair_care_habits' successfully!
 Encoded 'environmental_factors' successfully!
 Encoded 'smoking' successfully!
 Encoded 'weight_loss' successfully!
 Encoded 'medical_conditions' successfully!
 Encoded 'medications_and_treatments' successfully!
 Encoded 'nutritional_deficiencies' successfully!


In [5]:
X = df.drop(columns=["id", "hair_loss"], errors="ignore")  # Drop ID and Target column 'hair_loss'
y = df["hair_loss"]  # Target variable 

# Ensure all feature columns are numeric
X = X.apply(pd.to_numeric, errors="coerce")

# Handle missing values if any columns contain NaN values 
X.fillna(0, inplace=True)  

print("\nFeature Data Types:\n", X.dtypes)  
print("\nFeature Sample:\n", X.head())  

# Split the data into training and testing sets (80% train, 20% test)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"\nTraining Data Shape: {X_train.shape}")
print(f"Testing Data Shape: {X_test.shape}")



Feature Data Types:
 genetics                      int32
hormonal_changes              int32
medical_conditions            int32
medications_and_treatments    int32
nutritional_deficiencies      int32
stress                        int32
age                           int64
poor_hair_care_habits         int32
environmental_factors         int32
smoking                       int32
weight_loss                   int32
dtype: object

Feature Sample:
    genetics  hormonal_changes  medical_conditions  medications_and_treatments  \
0         1                 0                   5                           8   
1         0                 0                   4                           1   
2         0                 0                   3                           3   
3         1                 1                   7                           1   
4         0                 0                   6                           0   

   nutritional_deficiencies  stress  age  poor_hair_care_habits

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Initialize and train the model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
print(f"\n Model Accuracy: {accuracy:.2f}")



 Model Accuracy: 0.52


In [7]:
from sklearn.ensemble import RandomForestClassifier  
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# Assuming X and y are features and labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define and train the model
model = RandomForestClassifier()  
model.fit(X_train, y_train)

# predictions using the trained model
y_pred = model.predict(X_test)

print("\nClassification Report:\n", classification_report(y_test, y_pred))

print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))



Classification Report:
               precision    recall  f1-score   support

           0       0.45      0.65      0.53        89
           1       0.57      0.37      0.45       111

    accuracy                           0.49       200
   macro avg       0.51      0.51      0.49       200
weighted avg       0.52      0.49      0.49       200


Confusion Matrix:
 [[58 31]
 [70 41]]


In [8]:
import joblib

# Save the trained model
joblib.dump(model, 'hair_model.pkl')

# To load the saved model later:
# loaded_model = joblib.load('hair_loss_prediction_model.pkl')


['hair_model.pkl']

In [9]:
import pandas as pd
import joblib

# Load trained model
model = joblib.load("hair_model.pkl")

feature_names = ["genetics", "hormonal_changes", "medical_conditions", 
                 "medications_and_treatments", "nutritional_deficiencies", 
                 "stress", "age", "poor_hair_care_habits", 
                 "environmental_factors", "smoking", "weight_loss"]

stress_mapping = {"low": 0, "moderate": 1, "high": 2}

# Example input data 
new_data = [[1, 0, 1, 0, 1, stress_mapping["high"], 30, 1, 0, 1, 0]]  

new_data_df = pd.DataFrame(new_data, columns=feature_names)

# Make prediction
prediction = model.predict(new_data_df)

print("\nPrediction for New Data:", "Hair Loss" if prediction[0] == 1 else "No Hair Loss")



Prediction for New Data: Hair Loss


In [11]:
import pandas as pd
import joblib

# Load trained model
model = joblib.load("hair_model.pkl")

feature_names = ["genetics", "hormonal_changes", "medical_conditions", 
                 "medications_and_treatments", "nutritional_deficiencies", 
                 "stress", "age", "poor_hair_care_habits", 
                 "environmental_factors", "smoking", "weight_loss"]

# Define mapping for categorical values
stress_mapping = {"low": 0, "moderate": 1, "high": 2}

# Function to safely get integer input
def get_int_input(prompt):
    while True:
        value = input(prompt).strip()
        if value.isdigit():  # Check if input is a valid number
            return int(value)
        print("Invalid input! Please enter a valid number.")

# Function to get user input with validation
def get_user_input():
    print("\nEnter the following details:")
    genetics = get_int_input("Genetics (1 for Yes, 0 for No): ")
    hormonal_changes = get_int_input("Hormonal Changes (1 for Yes, 0 for No): ")
    medical_conditions = get_int_input("Medical Conditions (1 for Yes, 0 for No): ")
    medications_and_treatments = get_int_input("Medications & Treatments (1 for Yes, 0 for No): ")
    nutritional_deficiencies = get_int_input("Nutritional Deficiencies (1 for Yes, 0 for No): ")
    
    while True:  
        stress = input("Stress Level (low, moderate, high): ").strip().lower()
        if stress in stress_mapping:
            break
        print("Invalid input! Please enter 'low', 'moderate', or 'high'.")

    age = get_int_input("Age: ")
    poor_hair_care_habits = get_int_input("Poor Hair Care Habits (1 for Yes, 0 for No): ")
    environmental_factors = get_int_input("Environmental Factors (1 for Yes, 0 for No): ")
    smoking = get_int_input("Smoking (1 for Yes, 0 for No): ")
    weight_loss = get_int_input("Weight Loss (1 for Yes, 0 for No): ")

    # Convert stress to numerical value
    stress_numeric = stress_mapping[stress]

    return [[genetics, hormonal_changes, medical_conditions, 
             medications_and_treatments, nutritional_deficiencies, 
             stress_numeric, age, poor_hair_care_habits, 
             environmental_factors, smoking, weight_loss]]

# Get user input
new_data = get_user_input()

new_data_df = pd.DataFrame(new_data, columns=feature_names)

# Make prediction
prediction = model.predict(new_data_df)

print("\nPrediction for New Data:", "Hair Loss" if prediction[0] == 1 else "No Hair Loss")



Enter the following details:

Prediction for New Data: Hair Loss
