In [2]:
# Import necessary libraries
import io

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier

In [4]:
# Read the CSV at `file_path` into the DataFrame used by the following cells
file_path = 'data.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

In [6]:
# Display dataset information for initial exploration.
# DataFrame.info() writes its summary to a buffer (stdout by default) and
# returns None, so assigning its return value would leave `data_info` as None.
# Capture the text through `buf=` instead, then print it so the cell still
# shows the same summary.
info_buffer = io.StringIO()
data.info(buf=info_buffer)
data_info = info_buffer.getvalue()
print(data_info, end='')  # the captured text already ends with a newline

# First rows and per-column summary statistics, kept for later inspection
data_head = data.head()
data_description = data.describe(include='all')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 219 entries, 0 to 218
Data columns (total 17 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   EducationSector         219 non-null    object
 1   IndividualProject       219 non-null    object
 2   Age                     219 non-null    int64 
 3   Gender                  219 non-null    object
 4   City                    219 non-null    object
 5   Influenced              219 non-null    object
 6   Perseverance            219 non-null    int64 
 7   DesireToTakeInitiative  219 non-null    int64 
 8   Competitiveness         219 non-null    int64 
 9   SelfReliance            219 non-null    int64 
 10  StrongNeedToAchieve     219 non-null    int64 
 11  SelfConfidence          219 non-null    int64 
 12  GoodPhysicalHealth      219 non-null    int64 
 13  MentalDisorder          219 non-null    object
 14  KeyTraits               219 non-null    object
 15  Reason

In [8]:
# Pull out the target column 'y'; the features are every remaining column
# except 'ReasonsForLack', which is dropped together with the target
y = data['y']
X = data.drop(['y', 'ReasonsForLack'], axis=1)

In [10]:
# Group the feature columns by dtype: int64/float64 columns count as numerical,
# object columns count as categorical
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include='object').columns

In [12]:
# Preprocess categorical data: label encode categorical features.
# Every fitted encoder is stored in `label_encoders`, keyed by column name, so
# the category -> integer mapping stays available (via `classes_` /
# `inverse_transform`) for inspection or for encoding new rows the same way.
# Without this the mapping is lost as soon as the loop finishes.
# NOTE: X is overwritten column by column; re-running this cell without first
# rebuilding X refits the encoders on the already-encoded integers.
label_encoders = {}
for col in categorical_cols:
    encoder = LabelEncoder()
    X[col] = encoder.fit_transform(X[col])
    label_encoders[col] = encoder

In [14]:
# Hold out 20% of the rows as the test set; the fixed random_state makes the
# partition repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [20]:
# Candidate classifiers keyed by display name. Entries stay in this order,
# which is the order later cells iterate over them.
models = dict([
    ('Logistic Regression', LogisticRegression(max_iter=1000, random_state=42)),
    ('Decision Tree', DecisionTreeClassifier(random_state=42)),
    ('Random Forest', RandomForestClassifier(random_state=42)),
])

In [22]:
# Score every candidate with 5-fold cross-validation on the training split and
# record the mean accuracy under the model's display name
model_scores = {}
for model_name, model in models.items():
    cv_scores = cross_val_score(
        model,
        X_train,
        y_train,
        scoring='accuracy',
        cv=5,
    )
    # Average of the per-fold accuracies
    model_scores[model_name] = cv_scores.mean()

In [24]:
# Choose the display name with the highest mean CV accuracy. On a tie, max()
# keeps the first maximal entry in the dict's iteration order.
best_model_name = max(model_scores.items(), key=lambda item: item[1])[0]

# Refit the winning estimator on the whole training split
best_model = models[best_model_name]
best_model.fit(X_train, y_train)

In [26]:
# Build a feature-importance table when the fitted model exposes
# `feature_importances_`; otherwise leave `important_features` as None
important_features = None
if hasattr(best_model, 'feature_importances_'):
    feature_importances = best_model.feature_importances_
    importance_table = pd.DataFrame(
        {'Feature': X.columns, 'Importance': feature_importances}
    )
    # Most important features first
    important_features = importance_table.sort_values(by='Importance', ascending=False)

# Report the CV scores, the selected model and the importance table
print("Model Scores:", model_scores)
print("Best Model:", best_model_name)
print("Important Features:\n", important_features)

Model Scores: {'Logistic Regression': 0.5542857142857142, 'Decision Tree': 0.5885714285714285, 'Random Forest': 0.5885714285714285}
Best Model: Decision Tree
Important Features:
                    Feature  Importance
2                      Age    0.130643
7   DesireToTakeInitiative    0.103550
14               KeyTraits    0.096102
5               Influenced    0.081978
11          SelfConfidence    0.080696
8          Competitiveness    0.079744
10     StrongNeedToAchieve    0.077780
0          EducationSector    0.075960
3                   Gender    0.056948
12      GoodPhysicalHealth    0.054980
9             SelfReliance    0.049057
6             Perseverance    0.040373
4                     City    0.038486
1        IndividualProject    0.027359
13          MentalDisorder    0.006343


In [30]:
import pickle

# Persist the fitted model and the ordered feature column names together as
# one pickled (model, feature_names) tuple
feature_names = list(X.columns)
model_filename = 'decision_tree_model.pkl'

with open(model_filename, mode='wb') as file:
    pickle.dump((best_model, feature_names), file)

print("Model and feature names saved.")

Model and feature names saved.


In [32]:
# Show the ordered feature names that were pickled alongside the model
print(feature_names)

['EducationSector', 'IndividualProject', 'Age', 'Gender', 'City', 'Influenced', 'Perseverance', 'DesireToTakeInitiative', 'Competitiveness', 'SelfReliance', 'StrongNeedToAchieve', 'SelfConfidence', 'GoodPhysicalHealth', 'MentalDisorder', 'KeyTraits']
