<a href="https://colab.research.google.com/github/Bhanugt/DecisionTreee-prediction-streamlit/blob/main/decisiontreee_prediction_streamlit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import SelectKBest, f_classif, VarianceThreshold

# Encode categorical variables
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le

# Define features (X) and target (y)
target_col = "impact"
X = df.drop(columns=[target_col], errors="ignore")
y = df[target_col]

# Remove constant/low-variance features
constant_filter = VarianceThreshold(threshold=0.01)
X = constant_filter.fit_transform(X)

# Standardize numerical features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Feature selection using SelectKBest
selector = SelectKBest(f_classif, k=10)  # Selects top 10 features
X = selector.fit_transform(X, y)

# Handle class imbalance using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Define cross-validation strategy (Stratified K-Fold for balanced class distribution)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Initialize Decision Tree model
dt = DecisionTreeClassifier(class_weight='balanced', random_state=42)

# Perform cross-validation
cv_scores = cross_val_score(dt, X_resampled, y_resampled, cv=cv, scoring='accuracy')

# Train model on full training set
dt.fit(X_resampled, y_resampled)

# Evaluate model
y_pred = dt.predict(X_resampled)
accuracy = accuracy_score(y_resampled, y_pred)

# Print Cross-Validation Results
print("\nCross-Validation Scores:", cv_scores)
print("Mean Accuracy:", np.mean(cv_scores))
print("Standard Deviation:", np.std(cv_scores))

# Final Model Performance
print("\nDecision Tree Performance (After Cross-Validation):")
print("Accuracy:", accuracy)
print("Precision:", precision_score(y_resampled, y_pred, average='weighted'))
print("Recall:", recall_score(y_resampled, y_pred, average='weighted'))
print("F1 Score:", f1_score(y_resampled, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_resampled, y_pred))



Cross-Validation Scores: [0.3347216  0.3383906  0.33860571 0.33761404 0.33875446]
Mean Accuracy: 0.3376172836949435
Standard Deviation: 0.0015002109039102594

Decision Tree Performance (After Cross-Validation):
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     33614
           1       1.00      1.00      1.00     33614
           2       1.00      1.00      1.00     33614

    accuracy                           1.00    100842
   macro avg       1.00      1.00      1.00    100842
weighted avg       1.00      1.00      1.00    100842



In [5]:
from google.colab import files
files.download("decision_tree_model.pkl")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [4]:
!zip -r decision_tree_project.zip decision_tree_model.pkl streamlit_app.py requirements.txt
from google.colab import files
files.download("decision_tree_project.zip")


  adding: decision_tree_model.pkl (deflated 86%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>