In [211]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as p
import seaborn as sns
from scipy.stats import skew

In [213]:
train_data = pd.read_csv("C:/Users/maila/OneDrive/Desktop/linear/train.csv")
test_data = pd.read_csv("C:/Users/maila/OneDrive/Desktop/linear/test.csv")

In [215]:
train_data.head(5)

Unnamed: 0.1,Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
0,0,Tata Tiago 1.2 Revotron XZ WO Alloy,Hyderabad,2017,16500,Petrol,Manual,First,23.84 kmpl,1199 CC,84 bhp,5.0,5.0
1,1,Mahindra TUV 300 2015-2019 mHAWK100 T8 AMT,Kochi,2017,47357,Diesel,Automatic,First,18.49 kmpl,1493 CC,100 bhp,7.0,8.37
2,2,Skoda Rapid 2013-2016 1.6 MPI Ambition,Mumbai,2014,35000,Petrol,Manual,First,15.0 kmpl,1598 CC,103.52 bhp,5.0,4.5
3,3,Tata Indica V2 DLS BSII,Jaipur,2007,200000,Diesel,Manual,Second,17.2 kmpl,1396 CC,53.5 bhp,5.0,0.85
4,4,Tata Manza Club Class Quadrajet90 LX,Mumbai,2013,64000,Diesel,Manual,Second,21.02 kmpl,1248 CC,88.76 bhp,5.0,2.65


In [217]:
test_data.head(5)

Unnamed: 0.1,Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
0,0,Maruti Swift Dzire VDI,Hyderabad,2013,84175,Diesel,Manual,First,23.4 kmpl,1248 CC,74 bhp,5.0,6.95
1,1,Hyundai i10 Sportz AT,Delhi,2012,62282,Petrol,Automatic,First,16.95 kmpl,1197 CC,78.9 bhp,5.0,2.9
2,2,Hyundai Elite i20 Sportz Plus,Kochi,2017,37460,Petrol,Manual,First,18.6 kmpl,1197 CC,81.86 bhp,5.0,6.28
3,3,Volvo XC60 D5,Pune,2011,106976,Diesel,Automatic,First,13.5 kmpl,2400 CC,215 bhp,5.0,13.8
4,4,Hyundai i10 Magna 1.2,Delhi,2013,60000,Petrol,Manual,First,20.36 kmpl,1197 CC,78.9 bhp,5.0,2.75


In [219]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3750 entries, 0 to 3749
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         3750 non-null   int64  
 1   Name               3750 non-null   object 
 2   Location           3750 non-null   object 
 3   Year               3750 non-null   int64  
 4   Kilometers_Driven  3750 non-null   int64  
 5   Fuel_Type          3750 non-null   object 
 6   Transmission       3750 non-null   object 
 7   Owner_Type         3750 non-null   object 
 8   Mileage            3749 non-null   object 
 9   Engine             3722 non-null   object 
 10  Power              3722 non-null   object 
 11  Seats              3718 non-null   float64
 12  Price              3750 non-null   float64
dtypes: float64(2), int64(3), object(8)
memory usage: 381.0+ KB


In [221]:
train_data.isnull().sum()

Unnamed: 0            0
Name                  0
Location              0
Year                  0
Kilometers_Driven     0
Fuel_Type             0
Transmission          0
Owner_Type            0
Mileage               1
Engine               28
Power                28
Seats                32
Price                 0
dtype: int64

In [223]:
train_data = train_data.drop(columns=["Unnamed: 0"])
test_data = test_data.drop(columns=["Unnamed: 0"])

In [225]:
def preprocess_units(column, unit):
    return column.str.replace(" " + unit, "", regex=False).replace("null", None).astype(float)

train_data["Mileage"] = preprocess_units(train_data["Mileage"], "kmpl")
train_data["Engine"] = preprocess_units(train_data["Engine"], "CC")
train_data["Power"] = preprocess_units(train_data["Power"], "bhp")

test_data["Mileage"] = preprocess_units(test_data["Mileage"], "kmpl")
test_data["Engine"] = preprocess_units(test_data["Engine"], "CC")
test_data["Power"] = preprocess_units(test_data["Power"], "bhp")



In [227]:
X_train = train_data.drop(columns=["Price", "Name"])
y_train = train_data["Price"]
X_test = test_data.drop(columns=["Name"])
y_test = test_data["Price"]

In [229]:
X_train = X_train.apply(pd.to_numeric, errors="coerce")
X_test = X_test.apply(pd.to_numeric, errors="coerce") #converting to numeric datatype for all columns
print(X_train.isnull().sum())

Location             3750
Year                    0
Kilometers_Driven       0
Fuel_Type            3750
Transmission         3750
Owner_Type           3750
Mileage                 1
Engine                 28
Power                  98
Seats                  32
dtype: int64


In [231]:
def normalize_data(X):
    return (X - X.mean()) / X.std() #normalizing the values to make it simpler to fit into the ml model

In [233]:
class Linear_Regression:
    

    def __init__ (self , lr = 0.01 , n_iterations  = 10000 , regularization_constant  = 30):
        self.lr = lr
        self.n_iterations = n_iterations
        self.weights = None
        self.bias = None
        self.costs = [] ## keeps track of a MSE in each iteration (which will be used later in the cost function vs epoch graph)
        self.regularization_constant = regularization_constant

    def fit(self, X,y):
        number_of_samples , number_of_features  = X.shape
        #starting off the gradient descent at w1,w2,...,w34 and bias at 0
        self.weights = np.zeros(number_of_features)
        self.bias = 0 
        #gradient descent for 1000 iterations
        
        for _ in range(self.n_iterations):

            y_predict  = np.dot(X , self.weights) + self.bias # we took X and self weights in that specific order since it is a matrix multiplication

            dw  = (1/number_of_samples)*(  np.dot(X.T , y_predict  - y)) + (self.regularization_constant/number_of_samples)*(self.weights)
            # here we had to take a transpose because the shape of X is 183,34 and y predict is 183, and for matrix multiplication
            # we need 34,183 * 183, so we took transpose of matrix X
            #self.regularization constant is the value of lambda
            
            db  = (1/number_of_samples)*np.sum(y_predict  - y )
            #updating the weights
            
            self.weights =self.weights - self.lr*dw
            self.bias= self.bias - self.lr*db
            cost = (1 / (2 * number_of_samples)) * np.sum((y_predict - y) ** 2) + \
                   (self.regularization_constant / (2 * number_of_samples)) * np.sum(self.weights ** 2)
            self.costs.append(cost) 


    def predict(self,X):
        predictions  = np.dot(X,self.weights) +self.bias
        return predictions

In [235]:
class Statistical_Methods:
    def MSE(self, Y, Y_predicted):
        return np.mean((Y - Y_predicted) ** 2)

    def RMSE(self, Y, Y_predicted):
        return np.sqrt(self.MSE(Y, Y_predicted))

    def Rsquared(self, Y, Y_predicted):
        total_variance = np.sum((Y - np.mean(Y)) ** 2)
        unexplained_variance = np.sum((Y - Y_predicted) ** 2)
        return 1 - (unexplained_variance / total_variance)

In [237]:
X_train = pd.DataFrame(X_train)
y_train = pd.Series(y_train)
X_test = pd.DataFrame(X_test)
y_test = pd.Series(y_test)

# Now, convert all columns in X_train and X_test to numeric
X_train = X_train.apply(pd.to_numeric, errors='coerce')
y_train = pd.to_numeric(y_train, errors='coerce')
X_test = X_test.apply(pd.to_numeric, errors='coerce')
y_test = pd.to_numeric(y_test, errors='coerce')

# Check for NaN values after conversion and handle them
print("NaN values in X_train after conversion:", X_train.isna().sum().sum())
print("NaN values in Y_train after conversion:", y_train.isna().sum())
print("NaN values in X_test after conversion:", X_test.isna().sum().sum())
print("NaN values in Y_test after conversion:", y_test.isna().sum())

# Fill NaN values with column means or drop them
X_train = X_train.fillna(X_train.mean())
y_train = y_train.fillna(y_train.mean())
X_test = X_test.fillna(X_test.mean())
y_test = y_test.fillna(y_test.mean())

NaN values in X_train after conversion: 15159
NaN values in Y_train after conversion: 0
NaN values in X_test after conversion: 5039
NaN values in Y_test after conversion: 0


In [239]:
# Replace NaNs with the mean of the columns
X_train = np.where(np.isnan(X_train), np.nanmean(X_train, axis=0), X_train)
X_test = np.where(np.isnan(X_test), np.nanmean(X_test, axis=0), X_test)

  X_train = np.where(np.isnan(X_train), np.nanmean(X_train, axis=0), X_train)
  X_test = np.where(np.isnan(X_test), np.nanmean(X_test, axis=0), X_test)


In [241]:
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)
print("Non-numeric columns in X_train:", X_train.select_dtypes(include=['object']).columns)

Non-numeric columns in X_train: RangeIndex(start=0, stop=0, step=1)


In [243]:
for col in X_train.select_dtypes(include=['object']).columns:
    X_train[col] = X_train[col].fillna(X_train[col].mode()[0])
    X_test[col] = X_test[col].fillna(X_test[col].mode()[0])

In [245]:
X_train = X_train.fillna(0)
X_test = X_test.fillna(0)

# Check again to make sure there are no NaNs left
print("NaN values in X_train after filling:", X_train.isna().sum().sum())
print("NaN values in X_test after filling:", X_test.isna().sum().sum())


NaN values in X_train after filling: 0
NaN values in X_test after filling: 0


In [247]:
print(X_train.shape,X_test.shape)

(3750, 10) (1250, 11)


In [249]:
X_test = X_test.drop(columns=[X_test.columns[-1]])

In [251]:
print(X_train.shape,X_test.shape)

(3750, 10) (1250, 10)


In [253]:
print("NaN values in X_train:", np.any(np.isnan(X_train)))
print("NaN values in X_test:", np.any(np.isnan(X_test)))
print("NaN values in y_train:", np.any(np.isnan(y_train)))
print("NaN values in y_test:", np.any(np.isnan(y_test)))
print("Infinite values in X_train:", np.any(np.isinf(X_train)))
print("Infinite values in X_test:", np.any(np.isinf(X_test)))
print("Infinite values in y_train:", np.any(np.isinf(y_train)))
print("Infinite values in y_test:", np.any(np.isinf(y_test)))

NaN values in X_train: False
NaN values in X_test: False
NaN values in y_train: False
NaN values in y_test: False
Infinite values in X_train: False
Infinite values in X_test: False
Infinite values in y_train: False
Infinite values in y_test: False


In [255]:
print("Zero variance features in X_train:", np.any(np.std(X_train, axis=0) == 0))

Zero variance features in X_train: True


In [257]:
non_zero_variance_columns = np.std(X_train, axis=0) != 0
X_train = X_train.loc[:, non_zero_variance_columns]
X_test = X_test[X_train.columns]

In [259]:
print("Zero variance features in X_train:", np.any(np.std(X_train, axis=0) == 0))

Zero variance features in X_train: False


In [261]:
X_train = (X_train - np.mean(X_train, axis=0)) / np.std(X_train, axis=0)
X_test = (X_test - np.mean(X_test, axis=0)) / np.std(X_test, axis=0)

In [263]:
linearreg = Linear_Regression()
linearreg.fit(X_train , y_train)

y_predicted  = linearreg.predict(X_test)
print(y_predicted)
checker = Statistical_Methods()
print("Mean Squared Error:")
print(checker.MSE(y_test , y_predicted))

print("RMS ERROR")
print(checker.RMSE(y_test , y_predicted))

print("R^2 ERROR")
print(checker.Rsquared(y_test, y_predicted))

[ 2.23254985  2.19037276  7.68945034 ... 10.54683786  9.49715102
  9.78790373]
Mean Squared Error:
41.96991192746939
RMS ERROR
6.478418937323318
R^2 ERROR
0.6614747745431129
