<a href="https://colab.research.google.com/github/Alija2711/Sample-1/blob/main/intermediate2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

train_df = pd.read_csv('/content/train_LZdllcl.csv')
test_df = pd.read_csv('/content/test_2umaH9m.csv')

print(train_df.head())
print(train_df.describe())
print(train_df.info())

numerical_features = train_df.select_dtypes(include=np.number).columns
numerical_features_test = numerical_features.drop('is_promoted', errors='ignore')
train_df[numerical_features] = train_df[numerical_features].fillna(train_df[numerical_features].mean())
test_df[numerical_features_test] = test_df[numerical_features_test].fillna(test_df[numerical_features_test].mean())

categorical_features = train_df.select_dtypes(include=['object']).columns
for feature in categorical_features:
    mode = train_df[feature].mode()[0]
    train_df[feature].fillna(mode, inplace=True)
    test_df[feature].fillna(mode, inplace=True)

categorical_cols = train_df.select_dtypes(include=['object']).columns.tolist()
numerical_cols = train_df.select_dtypes(include=['number']).columns.tolist()
numerical_cols.remove('is_promoted')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ]
)

X_train_processed = preprocessor.fit_transform(train_df.drop('is_promoted', axis=1))
X_test_processed = preprocessor.transform(test_df)
y_train = train_df['is_promoted']


   employee_id         department     region         education gender  \
0        65438  Sales & Marketing   region_7  Master's & above      f   
1        65141         Operations  region_22        Bachelor's      m   
2         7513  Sales & Marketing  region_19        Bachelor's      m   
3         2542  Sales & Marketing  region_23        Bachelor's      m   
4        48945         Technology  region_26        Bachelor's      m   

  recruitment_channel  no_of_trainings  age  previous_year_rating  \
0            sourcing                1   35                   5.0   
1               other                1   30                   5.0   
2            sourcing                1   34                   3.0   
3               other                2   39                   1.0   
4               other                1   45                   3.0   

   length_of_service  KPIs_met >80%  awards_won?  avg_training_score  \
0                  8              1            0                  49   
1 

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df[feature].fillna(mode, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df[feature].fillna(mode, inplace=True)


In [24]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
X_train, X_val, y_train, y_val = train_test_split(X_train_processed, y_train, test_size=0.2, random_state=42)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_val)
print('Accuracy:', accuracy_score(y_val, y_pred))
print('Precision:', precision_score(y_val, y_pred))
print('Recall:', recall_score(y_val, y_pred))
print('F1 Score:', f1_score(y_val, y_pred))


Accuracy: 0.9355956942163839
Precision: 0.9105691056910569
Recall: 0.24669603524229075
F1 Score: 0.3882149046793761


In [20]:
from sklearn.model_selection import RandomizedSearchCV
param_dist = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
random_search = RandomizedSearchCV(estimator=rf_model, param_distributions=param_dist, n_iter=10, cv=3, n_jobs=-1, verbose=2, random_state=42)
random_search.fit(X_train, y_train)
best_params = random_search.best_params_
best_score = random_search.best_score_
print('Best Parameters:', best_params)
print('Best Score:', best_score)
best_rf_model = random_search.best_estimator_


Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best Parameters: {'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': None}
Best Score: 0.9325821862268828


In [21]:
test_pred = best_rf_model.predict(X_test_processed)


In [22]:

submission_df = pd.read_csv('/content/sample_submission_M0L0uXE (1).csv')
submission_df['is_promoted'] = test_pred
submission_df.to_csv('final_submission.csv', index=False)
print("Submission file created successfully!")


Submission file created successfully!
