In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [2]:
#Step1 Load The Dataset
train_df = pd.read_csv('train.csv')

In [3]:
train_df.head()

Unnamed: 0,Employee ID,Date of Joining,Gender,Company Type,WFH Setup Available,Designation,Resource Allocation,Mental Fatigue Score,Burn Rate
0,fffe32003000360033003200,2008-09-30,Female,Service,No,2.0,3.0,3.8,0.16
1,fffe3700360033003500,2008-11-30,Male,Service,Yes,1.0,2.0,5.0,0.36
2,fffe31003300320037003900,2008-03-10,Female,Product,Yes,2.0,,5.8,0.49
3,fffe32003400380032003900,2008-11-03,Male,Service,Yes,1.0,1.0,2.6,0.2
4,fffe31003900340031003600,2008-07-24,Female,Service,No,3.0,7.0,6.9,0.52


In [4]:
# Define your features (X) and target (y) from the training data
x_train = train_df.drop(columns=['Burn Rate']) #independent variable
y_train = train_df['Burn Rate'] #Dependent Variable

In [5]:
#Filling null values with mean
y_train = y_train.fillna(y_train.mean())

In [6]:
x_train.columns

Index(['Employee ID', 'Date of Joining', 'Gender', 'Company Type',
       'WFH Setup Available', 'Designation', 'Resource Allocation',
       'Mental Fatigue Score'],
      dtype='object')

In [7]:
# Step 2: Preprocessing (handling categorical variables, missing values, and scaling)
preprocessor = ColumnTransformer(
    transformers=[
        # Apply StandardScaler to numerical columns
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),  # Handle missing values
            ('scaler', StandardScaler())  # Apply standard scaling
        ]), ['Designation', 'Resource Allocation','Mental Fatigue Score']),  # Handle numerical columns
        
        # Apply OneHotEncoder to categorical columns
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['Employee ID', 'Date of Joining', 'Gender', 'Company Type','WFH Setup Available'])  # Handle categorical columns
    ])


In [8]:
# Step 3: Create a pipeline that first applies preprocessing and then fits the Linear Regression model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())  # Linear regression model
])

In [9]:
# Step 4: Train the model on the entire training dataset
model.fit(x_train, y_train)

In [10]:
test_df = pd.read_csv('test.csv')

In [11]:
# Step 6: Preprocess the test data and make predictions
test_predictions = model.predict(test_df)

In [12]:
# Step 7: Add the predicted Burn rate to the test data and save the result
test_df['Predicted_Burn_Rate'] = test_predictions

In [13]:
test_df.to_csv('test_with_predictions.csv', index=False)

print("Predictions have been added to the test data and saved to 'test_with_predictions.csv'.")

Predictions have been added to the test data and saved to 'test_with_predictions.csv'.


In [14]:
model.score(x_train,y_train)

0.9999999721379857