# Data Processing

In [1]:
# Calling libraries 

import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sbs
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [2]:
df=pd.read_csv("train.csv")

In [3]:
df.head(10)


Unnamed: 0,id,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,...,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount
0,0,19.0,Female,10049.0,Married,1.0,Bachelor's,Self-Employed,22.598761,Urban,...,2.0,17.0,372.0,5.0,2023-12-23 15:21:39.134960,Poor,No,Weekly,House,2869.0
1,1,39.0,Female,31678.0,Divorced,3.0,Master's,,15.569731,Rural,...,1.0,12.0,694.0,2.0,2023-06-12 15:21:39.111551,Average,Yes,Monthly,House,1483.0
2,2,23.0,Male,25602.0,Divorced,3.0,High School,Self-Employed,47.177549,Suburban,...,1.0,14.0,,3.0,2023-09-30 15:21:39.221386,Good,Yes,Weekly,House,567.0
3,3,21.0,Male,141855.0,Married,2.0,Bachelor's,,10.938144,Rural,...,1.0,0.0,367.0,1.0,2024-06-12 15:21:39.226954,Poor,Yes,Daily,Apartment,765.0
4,4,21.0,Male,39651.0,Single,1.0,Bachelor's,Self-Employed,20.376094,Rural,...,0.0,8.0,598.0,4.0,2021-12-01 15:21:39.252145,Poor,Yes,Weekly,House,2022.0
5,5,29.0,Male,45963.0,Married,1.0,Bachelor's,,33.053198,Urban,...,2.0,4.0,614.0,5.0,2022-05-20 15:21:39.207847,Average,No,Weekly,House,3202.0
6,6,41.0,Male,40336.0,Married,0.0,PhD,,,Rural,...,2.0,8.0,807.0,6.0,2020-02-21 15:21:39.219432,Poor,No,Weekly,House,439.0
7,7,48.0,Female,127237.0,Divorced,2.0,High School,Employed,5.769783,Suburban,...,1.0,11.0,398.0,5.0,2022-08-08 15:21:39.181605,Average,No,Rarely,Condo,111.0
8,8,21.0,Male,1733.0,Divorced,3.0,Bachelor's,,17.869551,Urban,...,1.0,10.0,685.0,8.0,2020-12-14 15:21:39.198406,Average,No,Monthly,Condo,213.0
9,9,44.0,Male,52447.0,Married,2.0,Master's,Employed,20.473718,Urban,...,1.0,9.0,635.0,3.0,2020-08-02 15:21:39.144722,Poor,No,Daily,Condo,64.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200000 entries, 0 to 1199999
Data columns (total 21 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   id                    1200000 non-null  int64  
 1   Age                   1181295 non-null  float64
 2   Gender                1200000 non-null  object 
 3   Annual Income         1155051 non-null  float64
 4   Marital Status        1181471 non-null  object 
 5   Number of Dependents  1090328 non-null  float64
 6   Education Level       1200000 non-null  object 
 7   Occupation            841925 non-null   object 
 8   Health Score          1125924 non-null  float64
 9   Location              1200000 non-null  object 
 10  Policy Type           1200000 non-null  object 
 11  Previous Claims       835971 non-null   float64
 12  Vehicle Age           1199994 non-null  float64
 13  Credit Score          1062118 non-null  float64
 14  Insurance Duration    1199999 non-

In [9]:
#Missing values

missing_columns=df.isnull().sum()
missing_columns=missing_columns[missing_columns>0]
if not missing_columns.empty:
    print(f"Missing values in dataset:\n{missing_columns}")
else:
    print("No Missing values in dataset")

Missing values in dataset:
Age                      18705
Annual Income            44949
Marital Status           18529
Number of Dependents    109672
Occupation              358075
Health Score             74076
Previous Claims         364029
Vehicle Age                  6
Credit Score            137882
Insurance Duration           1
Customer Feedback        77824
dtype: int64


In [14]:
# Nurical and categorical columns

numerical_columns = df.select_dtypes(include=["int64","float64"]).columns.tolist()
categorical_columns = df.select_dtypes(include=["object","category"]).columns.tolist()

print(f"Numerical Columns:\n{numerical_columns}")
print(f"Total number of numerical columns: {len(numerical_columns)}")

print(f"\nCategorical Columns:\n{categorical_columns}")
print(f"Total number of categorical columns: {len(categorical_columns)}")

Numerical Columns:
['id', 'Age', 'Annual Income', 'Number of Dependents', 'Health Score', 'Previous Claims', 'Vehicle Age', 'Credit Score', 'Insurance Duration', 'Premium Amount']
Total number of numerical columns: 10

Categorical Columns:
['Gender', 'Marital Status', 'Education Level', 'Occupation', 'Location', 'Policy Type', 'Policy Start Date', 'Customer Feedback', 'Smoking Status', 'Exercise Frequency', 'Property Type']
Total number of categorical columns: 11


In [17]:
df['Policy Start Date'].unique()

array(['2023-12-23 15:21:39.134960', '2023-06-12 15:21:39.111551',
       '2023-09-30 15:21:39.221386', ..., '2021-04-28 15:21:39.129190',
       '2019-11-14 15:21:39.201446', '2020-10-19 15:21:39.118178'],
      shape=(167381,), dtype=object)

In [26]:
# Cleaning of "Policy Start Date" columns

df["Policy Start Date"]=pd.to_datetime(df["Policy Start Date"], errors="coerce")

df["Year of Policy Start Date"]=df["Policy Start Date"].dt.year
df["Month of Policy Start Date"]=df["Policy Start Date"].dt.month
df["Day of Policy Start Date"]=df["Policy Start Date"].dt.day

df.drop("Policy Start Date",axis=1,inplace=True)
print(df.head(3))

   id   Age  Gender  Annual Income Marital Status  Number of Dependents  \
0   0  19.0  Female        10049.0        Married                   1.0   
1   1  39.0  Female        31678.0       Divorced                   3.0   
2   2  23.0    Male        25602.0       Divorced                   3.0   

  Education Level     Occupation  Health Score  Location  ... Credit Score  \
0      Bachelor's  Self-Employed     22.598761     Urban  ...        372.0   
1        Master's            NaN     15.569731     Rural  ...        694.0   
2     High School  Self-Employed     47.177549  Suburban  ...          NaN   

   Insurance Duration  Customer Feedback  Smoking Status  Exercise Frequency  \
0                 5.0               Poor              No              Weekly   
1                 2.0            Average             Yes             Monthly   
2                 3.0               Good             Yes              Weekly   

  Property Type Premium Amount Year of Policy Start Date  \
0    

In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200000 entries, 0 to 1199999
Data columns (total 23 columns):
 #   Column                      Non-Null Count    Dtype  
---  ------                      --------------    -----  
 0   id                          1200000 non-null  int64  
 1   Age                         1181295 non-null  float64
 2   Gender                      1200000 non-null  object 
 3   Annual Income               1155051 non-null  float64
 4   Marital Status              1181471 non-null  object 
 5   Number of Dependents        1090328 non-null  float64
 6   Education Level             1200000 non-null  object 
 7   Occupation                  841925 non-null   object 
 8   Health Score                1125924 non-null  float64
 9   Location                    1200000 non-null  object 
 10  Policy Type                 1200000 non-null  object 
 11  Previous Claims             835971 non-null   float64
 12  Vehicle Age                 1199994 non-null  float64
 1

# Handling Missing Values

In [29]:
# Null qiymatlarni bor ustunlar to'ldirish 

missing_columns=df.columns[df.isnull().sum()>0]

for col in missing_columns:
    if df[col].dtype=="object":
        df[col].fillna(df[col].mode()[0],inplace=True)
    else:
        df[col].fillna(df[col].mean(),inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(),inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0],inplace=True)


In [30]:
print(df.head(10))

   id   Age  Gender  Annual Income Marital Status  Number of Dependents  \
0   0  19.0  Female        10049.0        Married                   1.0   
1   1  39.0  Female        31678.0       Divorced                   3.0   
2   2  23.0    Male        25602.0       Divorced                   3.0   
3   3  21.0    Male       141855.0        Married                   2.0   
4   4  21.0    Male        39651.0         Single                   1.0   
5   5  29.0    Male        45963.0        Married                   1.0   
6   6  41.0    Male        40336.0        Married                   0.0   
7   7  48.0  Female       127237.0       Divorced                   2.0   
8   8  21.0    Male         1733.0       Divorced                   3.0   
9   9  44.0    Male        52447.0        Married                   2.0   

  Education Level     Occupation  Health Score  Location  ... Credit Score  \
0      Bachelor's  Self-Employed     22.598761     Urban  ...    372.00000   
1        Master's 

In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200000 entries, 0 to 1199999
Data columns (total 23 columns):
 #   Column                      Non-Null Count    Dtype  
---  ------                      --------------    -----  
 0   id                          1200000 non-null  int64  
 1   Age                         1200000 non-null  float64
 2   Gender                      1200000 non-null  object 
 3   Annual Income               1200000 non-null  float64
 4   Marital Status              1200000 non-null  object 
 5   Number of Dependents        1200000 non-null  float64
 6   Education Level             1200000 non-null  object 
 7   Occupation                  1200000 non-null  object 
 8   Health Score                1200000 non-null  float64
 9   Location                    1200000 non-null  object 
 10  Policy Type                 1200000 non-null  object 
 11  Previous Claims             1200000 non-null  float64
 12  Vehicle Age                 1200000 non-null  float64
 1

In [32]:
missing_columns=df.isnull().sum()
missing_columns=missing_columns[missing_columns>0]
if not missing_columns.empty:
    print(f"Missing values in dataset:\n{missing_columns}")
else:
    print("No Missing values in dataset")

No Missing values in dataset


# Encoding and Scaling

In [38]:

categorical_col = df.select_dtypes(include=["object","category"]).columns.tolist()
categorical_col




['Gender',
 'Marital Status',
 'Education Level',
 'Occupation',
 'Location',
 'Policy Type',
 'Customer Feedback',
 'Smoking Status',
 'Exercise Frequency',
 'Property Type']

In [41]:
# Encoding
# cardinality of categorical columns

cardinality = df[categorical_columns].nunique()


print(cardinality)




Gender                2
Marital Status        3
Education Level       4
Occupation            3
Location              3
Policy Type           3
Customer Feedback     3
Smoking Status        2
Exercise Frequency    4
Property Type         3
dtype: int64


In [45]:
# OneHot Encoding (get_dummies)

one_hot_encoding=pd.get_dummies(df,columns=categorical_columns,drop_first=True)
print(one_hot_encoding)

              id   Age  Annual Income  Number of Dependents  Health Score  \
0              0  19.0   10049.000000              1.000000     22.598761   
1              1  39.0   31678.000000              3.000000     15.569731   
2              2  23.0   25602.000000              3.000000     47.177549   
3              3  21.0  141855.000000              2.000000     10.938144   
4              4  21.0   39651.000000              1.000000     20.376094   
...          ...   ...            ...                   ...           ...   
1199995  1199995  36.0   27316.000000              0.000000     13.772907   
1199996  1199996  54.0   35786.000000              2.009934     11.483482   
1199997  1199997  19.0   51884.000000              0.000000     14.724469   
1199998  1199998  55.0   32745.217777              1.000000     18.547381   
1199999  1199999  21.0   32745.217777              0.000000     10.125323   

         Previous Claims  Vehicle Age  Credit Score  Insurance Duration  \


In [46]:
df=one_hot_encoding
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200000 entries, 0 to 1199999
Data columns (total 33 columns):
 #   Column                       Non-Null Count    Dtype  
---  ------                       --------------    -----  
 0   id                           1200000 non-null  int64  
 1   Age                          1200000 non-null  float64
 2   Annual Income                1200000 non-null  float64
 3   Number of Dependents         1200000 non-null  float64
 4   Health Score                 1200000 non-null  float64
 5   Previous Claims              1200000 non-null  float64
 6   Vehicle Age                  1200000 non-null  float64
 7   Credit Score                 1200000 non-null  float64
 8   Insurance Duration           1200000 non-null  float64
 9   Premium Amount               1200000 non-null  float64
 10  Year of Policy Start Date    1200000 non-null  int32  
 11  Month of Policy Start Date   1200000 non-null  int32  
 12  Day of Policy Start Date     1200000 non-n

In [None]:
df[df.columns]=df[df.columns].astype(int)


In [54]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200000 entries, 0 to 1199999
Data columns (total 33 columns):
 #   Column                       Non-Null Count    Dtype
---  ------                       --------------    -----
 0   id                           1200000 non-null  int64
 1   Age                          1200000 non-null  int64
 2   Annual Income                1200000 non-null  int64
 3   Number of Dependents         1200000 non-null  int64
 4   Health Score                 1200000 non-null  int64
 5   Previous Claims              1200000 non-null  int64
 6   Vehicle Age                  1200000 non-null  int64
 7   Credit Score                 1200000 non-null  int64
 8   Insurance Duration           1200000 non-null  int64
 9   Premium Amount               1200000 non-null  int64
 10  Year of Policy Start Date    1200000 non-null  int64
 11  Month of Policy Start Date   1200000 non-null  int64
 12  Day of Policy Start Date     1200000 non-null  int64
 13  Gender_Male 

In [55]:
categorical_col = df.select_dtypes(include=["object","category"]).columns.tolist()
categorical_col

[]

In [58]:
# Scaling
# MinMaxScaler

min_max_scaler=MinMaxScaler()
min_max_scaled=pd.DataFrame(min_max_scaler.fit_transform(df),columns=df.columns)


In [60]:
df=min_max_scaled

In [64]:
print(df.head(1))

    id       Age  Annual Income  Number of Dependents  Health Score  \
0  0.0  0.021739       0.066988                  0.25      0.357143   

   Previous Claims  Vehicle Age  Credit Score  Insurance Duration  \
0         0.222222     0.894737      0.131148                 0.5   

   Premium Amount  ...  Policy Type_Comprehensive  Policy Type_Premium  \
0        0.572203  ...                        0.0                  1.0   

   Customer Feedback_Good  Customer Feedback_Poor  Smoking Status_Yes  \
0                     0.0                     1.0                 0.0   

   Exercise Frequency_Monthly  Exercise Frequency_Rarely  \
0                         0.0                        0.0   

   Exercise Frequency_Weekly  Property Type_Condo  Property Type_House  
0                        1.0                  0.0                  1.0  

[1 rows x 33 columns]


In [62]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200000 entries, 0 to 1199999
Data columns (total 33 columns):
 #   Column                       Non-Null Count    Dtype  
---  ------                       --------------    -----  
 0   id                           1200000 non-null  float64
 1   Age                          1200000 non-null  float64
 2   Annual Income                1200000 non-null  float64
 3   Number of Dependents         1200000 non-null  float64
 4   Health Score                 1200000 non-null  float64
 5   Previous Claims              1200000 non-null  float64
 6   Vehicle Age                  1200000 non-null  float64
 7   Credit Score                 1200000 non-null  float64
 8   Insurance Duration           1200000 non-null  float64
 9   Premium Amount               1200000 non-null  float64
 10  Year of Policy Start Date    1200000 non-null  float64
 11  Month of Policy Start Date   1200000 non-null  float64
 12  Day of Policy Start Date     1200000 non-n

# Training and Predicting

In [67]:
#Date split
x=df.drop(columns="Premium Amount") # Input (Features)
y=df["Premium Amount"] # Output (Target variable)

In [72]:
x.head(1)


Unnamed: 0,id,Age,Annual Income,Number of Dependents,Health Score,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Year of Policy Start Date,...,Policy Type_Comprehensive,Policy Type_Premium,Customer Feedback_Good,Customer Feedback_Poor,Smoking Status_Yes,Exercise Frequency_Monthly,Exercise Frequency_Rarely,Exercise Frequency_Weekly,Property Type_Condo,Property Type_House
0,0.0,0.021739,0.066988,0.25,0.357143,0.222222,0.894737,0.131148,0.5,0.8,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0


In [75]:
print(f"Output Column:\n{y.head(10)}")

Output Column:
0    0.572203
1    0.293834
2    0.109861
3    0.149628
4    0.402089
5    0.639084
6    0.084153
7    0.018277
8    0.038763
9    0.008837
Name: Premium Amount, dtype: float64


In [82]:
x_train, x_temp, y_train, y_temp=train_test_split(x,y, test_size=0.2, random_state=42)

x_test, x_val, y_test, y_val=train_test_split(x_temp,y_temp, test_size=0.5, random_state=42)

In [89]:
x_train.shape

(960000, 32)

In [90]:
x_test.shape

(120000, 32)

In [92]:
x_val.shape

(120000, 32)

In [91]:
y_train.shape

(960000,)

In [93]:
# Model selecting
model=LinearRegression()

#Traing with model (80% in dataset)
model.fit(x_train,y_train)



In [95]:
# Model Prediction
y_pred = model.predict(x_test)

In [96]:
print(y) # old features

0          0.572203
1          0.293834
2          0.109861
3          0.149628
4          0.402089
             ...   
1199995    0.257682
1199996    0.160876
1199997    0.070496
1199998    0.115686
1199999    0.494075
Name: Premium Amount, Length: 1200000, dtype: float64


In [98]:
y_pred[0]

np.float64(0.23123856087588496)

In [99]:
#Modelni baxolash(model evaluation)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [100]:
print(f"\nModel Evaluation Metrics:")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"R² Score: {r2:.2f}")


Model Evaluation Metrics:
Mean Squared Error (MSE): 0.03
Mean Absolute Error (MAE): 0.13
R² Score: 0.00
