In [None]:
import numpy as np # numpy used for mathematical operation on array
import pandas as pd  # pandas used for data manipulation on dataframe
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.datasets import load_boston
from sklearn.preprocessing import StandardScaler

In [None]:
# Load only the first 1000 rows of the salary dataset (Colab upload path)
df = pd.read_csv(
    "/content/Salary_Data.csv",
    nrows=1000,
)

In [None]:
# Show (rows, columns) of the freshly loaded frame
df.shape

(1000, 29)

In [None]:
# List every column with its non-null count and dtype
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 29 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   timestamp                1000 non-null   object 
 1   company                  1000 non-null   object 
 2   level                    1000 non-null   object 
 3   title                    1000 non-null   object 
 4   totalyearlycompensation  1000 non-null   int64  
 5   location                 1000 non-null   object 
 6   yearsofexperience        1000 non-null   float64
 7   yearsatcompany           1000 non-null   float64
 8   tag                      561 non-null    object 
 9   basesalary               1000 non-null   float64
 10  stockgrantvalue          1000 non-null   float64
 11  bonus                    1000 non-null   int64  
 12  gender                   357 non-null    object 
 13  otherdetails             87 non-null     object 
 14  cityid                   

In [None]:
# Count missing values per column to decide which columns to drop or impute
df.isnull().sum()

timestamp                     0
company                       0
level                         0
title                         0
totalyearlycompensation       0
location                      0
yearsofexperience             0
yearsatcompany                0
tag                         439
basesalary                    0
stockgrantvalue               0
bonus                         0
gender                      643
otherdetails                913
cityid                        0
dmaid                         0
rowNumber                     0
Masters_Degree                0
Bachelors_Degree              0
Doctorate_Degree              0
Highschool                    0
Some_College                  0
Race_Asian                    0
Race_White                    0
Race_Two_Or_More              0
Race_Black                    0
Race_Hispanic                 0
Race                       1000
Education                   998
dtype: int64

In [None]:
# Remove the three mostly-empty columns; 'gender' and 'Education' are kept for imputation
df = df.drop(columns=["Race", "otherdetails", "tag"])

In [None]:
# Shape after dropping the three sparse columns
df.shape

(1000, 26)

In [None]:
# Re-check missing values; the columns still containing nulls are imputed next
df.isnull().sum()

timestamp                    0
company                      0
level                        0
title                        0
totalyearlycompensation      0
location                     0
yearsofexperience            0
yearsatcompany               0
basesalary                   0
stockgrantvalue              0
bonus                        0
gender                     643
cityid                       0
dmaid                        0
rowNumber                    0
Masters_Degree               0
Bachelors_Degree             0
Doctorate_Degree             0
Highschool                   0
Some_College                 0
Race_Asian                   0
Race_White                   0
Race_Two_Or_More             0
Race_Black                   0
Race_Hispanic                0
Education                  998
dtype: int64

In [None]:
# Impute missing categorical values with each column's mode (most frequent value).
# The result is assigned back to the frame rather than calling
# `df[col].fillna(..., inplace=True)`: that form mutates a temporary column object,
# emits a FutureWarning in pandas 2.x and leaves `df` unchanged under Copy-on-Write.
# NOTE(review): 'Education' has only 2 non-null values out of 1000 in this sample,
# so its mode carries almost no information — consider dropping the column instead.
for column in ['gender', 'Education']:
    df[column] = df[column].fillna(df[column].mode()[0])

In [None]:
# Start a per-column summary table; its single column (labelled 0) holds each dtype
Data_dict = df.dtypes.to_frame()
Data_dict

Unnamed: 0,0
timestamp,object
company,object
level,object
title,object
totalyearlycompensation,int64
location,object
yearsofexperience,float64
yearsatcompany,float64
basesalary,float64
stockgrantvalue,float64


In [None]:
# Add the number of missing values per column next to the dtypes
Data_dict['MissingVal'] = df.isna().sum()
Data_dict

Unnamed: 0,0,MissingVal
timestamp,object,0
company,object,0
level,object,0
title,object,0
totalyearlycompensation,int64,0
location,object,0
yearsofexperience,float64,0
yearsatcompany,float64,0
basesalary,float64,0
stockgrantvalue,float64,0


In [None]:

# Replace any value that is still missing with the placeholder string
df = df.fillna("Not_given")

In [None]:
# Verify that no missing values remain after imputation
df.isnull().sum()

timestamp                  0
company                    0
level                      0
title                      0
totalyearlycompensation    0
location                   0
yearsofexperience          0
yearsatcompany             0
basesalary                 0
stockgrantvalue            0
bonus                      0
gender                     0
cityid                     0
dmaid                      0
rowNumber                  0
Masters_Degree             0
Bachelors_Degree           0
Doctorate_Degree           0
Highschool                 0
Some_College               0
Race_Asian                 0
Race_White                 0
Race_Two_Or_More           0
Race_Black                 0
Race_Hispanic              0
Education                  0
dtype: int64

In [None]:
# Drop the unwanted columns: the timestamp and the per-degree indicator columns

df.drop(
    columns=[
        'timestamp',
        'Masters_Degree',
        'Bachelors_Degree',
        'Doctorate_Degree',
        'Highschool',
        'Some_College',
    ],
    inplace=True,
)

In [None]:
# Drop the per-race indicator columns plus 'level' and 'rowNumber' in a single call
df.drop(
    columns=[
        'Race_Asian',
        'Race_White',
        'Race_Two_Or_More',
        'Race_Black',
        'Race_Hispanic',
        'level',
        'rowNumber',
    ],
    inplace=True,
)

In [None]:
# Shape after removing the unwanted columns
df.shape

(1000, 13)

In [None]:
# Columns that remain available for modelling
df.columns

Index(['company', 'title', 'totalyearlycompensation', 'location',
       'yearsofexperience', 'yearsatcompany', 'basesalary', 'stockgrantvalue',
       'bonus', 'gender', 'cityid', 'dmaid', 'Education'],
      dtype='object')

In [None]:
from sklearn.preprocessing import LabelEncoder

# Text-valued columns that must become integer codes before modelling
categorical_columns = ['company', 'title', 'gender', 'location', 'Education']

# Fit a separate encoder per column and keep each one. Refitting one shared encoder
# would overwrite its `classes_` every time, so the code -> label mapping of every
# column except the last could not be recovered (e.g. with `inverse_transform`).
label_encoders = {}
for column in categorical_columns:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

# After the loop `le` is the encoder fitted on 'Education', the last column encoded.
# NOTE(review): integer codes impose an arbitrary ordering on nominal categories,
# which a linear model will treat as numeric magnitude — confirm this is intended.

In [None]:
def find_outliers_IQR(df):
  """Return the values lying outside the 1.5 * IQR fences, plus the quartiles.

  The quartiles are computed with ``np.percentile`` over *all* values of ``df``
  flattened together, so pass a single column (Series or 1-D array) to get
  meaningful fences for that variable.

  Parameters
  ----------
  df : pandas.Series, pandas.DataFrame or numpy.ndarray
      Numeric data to inspect.

  Returns
  -------
  tuple
      ``(outliers, Q1, Q3, IQR)`` where ``outliers`` is ``df`` indexed by the
      boolean mask "below the lower fence or above the upper fence".
  """
  Q1 = np.percentile(df, 25)
  Q3 = np.percentile(df, 75)
  IQR = Q3 - Q1
  lower_bound = Q1 - 1.5 * IQR
  upper_bound = Q3 + 1.5 * IQR
  # An outlier lies strictly outside the fences: below the lower one OR above the
  # upper one. (The reversed comparisons would match practically every value.)
  outliers = df[(df < lower_bound) | (df > upper_bound)]
  return outliers, Q1, Q3, IQR

In [None]:
# Run the IQR check one column at a time: find_outliers_IQR pools every value it is
# given, so passing the whole frame would compute quartiles over unrelated columns
# (encoded ids, years, salaries) at once.
outlier_columns = ['totalyearlycompensation', 'yearsofexperience', 'yearsatcompany',
                   'basesalary', 'stockgrantvalue', 'bonus']

# column name -> (outlier values, Q1, Q3, IQR), i.e. the function's full return tuple
outliers = {column: find_outliers_IQR(df[column]) for column in outlier_columns}

for column, (values, q1, q3, iqr) in outliers.items():
  print(f"{column}: Q1={q1}, Q3={q3}, IQR={iqr}, outliers={len(values)}")

(     company  title  totalyearlycompensation  location  yearsofexperience  \
0         67      3                   127000        53                1.5   
1        123      4                   100000        59                5.0   
2          6      3                   310000        64                8.0   
3          8      5                   372000        67                7.0   
4         57      4                   157000        40                5.0   
..       ...    ...                      ...       ...                ...   
995       32      3                   210000        35                5.0   
996      127      4                   290000        67                6.0   
997       44      4                   150000         3                5.0   
998        8      4                   275000        18                5.0   
999       37      3                   500000        64               16.0   

     yearsatcompany  basesalary  stockgrantvalue  bonus  gender  cityid  \

In [None]:
# Separate the response ('basesalary', as a column vector) from the predictors
# NOTE(review): 'totalyearlycompensation' stays among the predictors; if it includes
# base salary this leaks the target into the features — confirm.
target = df['basesalary'].to_numpy().reshape(-1, 1)
features = df.drop(columns=['basesalary']).to_numpy()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.33,
    random_state=2,
)

In [None]:
# Lasso Regression implemented from scratch with batch gradient descent

class Lasso_Regression():
  """Linear regression with an L1 penalty, fitted by batch (sub)gradient descent.

  The per-iteration gradients are those of
  ``(sum((Y - (X @ w + b)) ** 2) + lambda_parameter * sum(|w|)) / m``.

  Attributes set by ``fit``:
    m, n : number of rows / number of features of the training matrix
    w    : weight vector, shape ``(n,)``
    b    : bias (scalar)
    X, Y : training data as float arrays; ``Y`` is stored as an ``(m, 1)`` column
  """

  def __init__(self, learning_rate, no_of_iterations, lambda_parameter):
    """Store the hyperparameters.

    learning_rate    : step size of each gradient-descent update
    no_of_iterations : number of full-batch updates performed by ``fit``
    lambda_parameter : strength of the L1 penalty
    """
    self.learning_rate = learning_rate
    self.no_of_iterations = no_of_iterations
    self.lambda_parameter = lambda_parameter

  def fit(self, X, Y):
    """Fit weights and bias to ``X`` (shape ``(m, n)``) and target ``Y``.

    ``Y`` may be 1-D ``(m,)`` or a column ``(m, 1)``; it is always stored as a
    column so that it lines up with ``predict``'s ``(m, 1)`` output. Without this,
    a 1-D target would broadcast against the predictions into an ``(m, m)`` matrix.

    Raises
    ------
    ValueError
        If ``X`` and ``Y`` do not contain the same number of samples.
    """
    X = np.asarray(X, dtype=float)
    Y = np.asarray(Y, dtype=float).reshape(-1, 1)

    if X.shape[0] != Y.shape[0]:
      raise ValueError("X and Y must contain the same number of samples.")

    # m --> number of data points (rows), n --> number of input features (columns)
    self.m, self.n = X.shape

    self.w = np.zeros(self.n)
    self.b = 0

    self.X = X
    self.Y = Y

    # Gradient descent: one full-batch update per iteration
    for _ in range(self.no_of_iterations):
      self.update_weights()

  def update_weights(self):
    """Perform one gradient-descent step on ``w`` and ``b``."""
    # residual has shape (m, 1)
    residual = self.Y - self.predict(self.X)

    # Derivative of the L1 term: +lambda where w > 0, -lambda otherwise.
    # A weight that is exactly 0 takes the -lambda branch.
    penalty_sign = np.where(self.w > 0, 1.0, -1.0)

    # All n weight gradients at once; ravel() turns the (n, 1) product into (n,)
    dw = (-2 * self.X.T.dot(residual).ravel()
          + self.lambda_parameter * penalty_sign) / self.m

    # gradient for bias
    db = -2 * np.sum(residual) / self.m

    self.w = self.w - self.learning_rate * dw
    self.b = self.b - self.learning_rate * db

  # Backward-compatible alias for the originally misspelled method name
  upadte_weights = update_weights

  def predict(self, X):
    """Return predictions for ``X`` as a column of shape ``(len(X), 1)``."""
    return np.dot(X, self.w.reshape(-1, 1)) + self.b

  def mean_squared_error(self, y_true, y_pred):
    """Mean of the squared differences between ``y_true`` and ``y_pred``.

    ``y_pred`` is reshaped to ``y_true``'s shape, so a 1-D vector and an ``(m, 1)``
    column can be mixed. The mean is taken along axis 0: column inputs give a
    length-1 array, 1-D inputs give a scalar.

    Raises
    ------
    ValueError
        If the inputs are empty or differ in length.
    """
    if len(y_true) != len(y_pred):
      raise ValueError("Length of y_true and y_pred should be the same.")
    if len(y_true) == 0:
      raise ValueError("y_true and y_pred must not be empty.")

    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float).reshape(y_true.shape)

    return np.mean((y_true - y_pred) ** 2, axis=0)

  def r2(self, y_true, y_pred):
    """Coefficient of determination, ``1 - RSS / TSS``.

    ``y_pred`` is reshaped to ``y_true``'s shape so that mixed 1-D / column inputs
    are compared element-wise instead of broadcasting into a matrix. Sums run
    along axis 0: column inputs give a length-1 array, 1-D inputs give a scalar.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float).reshape(y_true.shape)

    # total sum of squares around the mean of the true values
    tss = np.sum((y_true - y_true.mean(axis=0)) ** 2, axis=0)

    # residual sum of squares
    rss = np.sum((y_true - y_pred) ** 2, axis=0)

    return 1 - (rss / tss)

In [None]:
# Learn the scaling statistics from the training split only, then apply them to both
# splits so no information from the test rows enters the fit
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Instantiate the from-scratch Lasso model with its gradient-descent hyperparameters
model = Lasso_Regression(
    learning_rate=0.1,
    no_of_iterations=1000,
    lambda_parameter=200,
)

In [None]:
# Train on the standardized training features
model.fit(X_train, y_train)

In [None]:
# Predict the target for the standardized held-out features
test_data_prediction = model.predict(X_test)

In [None]:
from sklearn import metrics

# R-squared (coefficient of determination) on the held-out set
score_1 = model.r2(y_test, test_data_prediction)

# Mean squared error on the held-out set
score_2 = model.mean_squared_error(y_test, test_data_prediction)

# The labels name the metrics actually computed: R-squared is a score, not an error,
# and score_2 is the mean *squared* error, not the mean absolute error.
print("R squared score : ", score_1)
print('Mean Squared Error : ', score_2)

R squared error :  [0.16859972]
Mean Absolute Error :  [4.31275901e+09]


In [None]:
# Importing libraries
  
import numpy as np
  
import pandas as pd
  
from sklearn.model_selection import train_test_split
  
import matplotlib.pyplot as plt
  
# Lasso Regression
  
class LassoRegression() :
    """Linear regression with an L1 penalty, fitted by batch (sub)gradient descent.

    The per-iteration gradients are those of
    ``(sum((Y - (X @ W + b)) ** 2) + l1_penality * sum(|W|)) / m``.

    Attributes set by ``fit``:
        m, n : number of training examples / number of features
        W    : weight vector, shape ``(n,)``
        b    : bias (scalar)
        X, Y : training data as float arrays; ``Y`` is stored 1-D, shape ``(m,)``
    """

    def __init__( self, learning_rate, iterations, l1_penality ) :
        """Store the step size, the number of updates and the L1 penalty strength."""
        self.learning_rate = learning_rate
        self.iterations = iterations
        self.l1_penality = l1_penality

    def fit( self, X, Y ) :
        """Fit the model on ``X`` (shape ``(m, n)``) and target ``Y``; return ``self``.

        ``Y`` may be 1-D ``(m,)`` or a column ``(m, 1)``; it is flattened so that it
        lines up with ``predict``'s 1-D output. A column target left as-is would
        broadcast against the predictions into an ``(m, m)`` matrix.

        Raises
        ------
        ValueError
            If ``X`` and ``Y`` do not contain the same number of samples.
        """
        X = np.asarray( X, dtype = float )
        Y = np.asarray( Y, dtype = float ).ravel()

        if X.shape[0] != Y.shape[0] :
            raise ValueError( "X and Y must contain the same number of samples." )

        # no_of_training_examples, no_of_features
        self.m, self.n = X.shape

        # weight initialization
        self.W = np.zeros( self.n )
        self.b = 0

        self.X = X
        self.Y = Y

        # gradient descent learning
        for _ in range( self.iterations ) :
            self.update_weights()

        return self

    def update_weights( self ) :
        """Perform one gradient-descent step on ``W`` and ``b``; return ``self``."""
        residual = self.Y - self.predict( self.X )

        # Derivative of the L1 term: +penalty where W > 0, -penalty otherwise.
        # A weight that is exactly 0 takes the negative branch.
        penalty_sign = np.where( self.W > 0, 1.0, -1.0 )

        # all n weight gradients in one matrix-vector product
        dW = ( - 2 * self.X.T.dot( residual )
               + self.l1_penality * penalty_sign ) / self.m

        db = - 2 * np.sum( residual ) / self.m

        # update weights
        self.W = self.W - self.learning_rate * dW
        self.b = self.b - self.learning_rate * db

        return self

    def predict( self, X ) :
        """Hypothesis h(x): return ``X.dot(W) + b`` (1-D for a 2-D array ``X``)."""
        return X.dot( self.W ) + self.b

In [None]:
# Separate the response ('basesalary', as a column vector) from the predictor matrix
y = df['basesalary'].to_numpy().reshape(-1, 1)
X = df.drop(columns=['basesalary']).to_numpy()

In [None]:
# Hold out one third of the rows; the fixed seed keeps the split reproducible
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    y,
    test_size=1 / 3,
    random_state=0,
)

In [None]:
# Instantiate the second Lasso implementation with its hyperparameters
model = LassoRegression(
    iterations=1000,
    learning_rate=0.01,
    l1_penality=500,
)

In [None]:
# Use the target returned by the split that produced the current X_train (Y_train).
# Truncating the older `y_train` would pair these feature rows with labels from a
# different split (other seed and test size), so rows and labels would not match.
# ravel() gives the 1-D target that matches LassoRegression.predict's 1-D output.
y_train = Y_train.ravel()

In [None]:
# Train the second Lasso implementation on the latest split
# NOTE(review): this X_train comes straight from df values and is never passed through
# the StandardScaler; gradient descent on unscaled features may diverge — confirm.
model.fit( X_train, y_train )

In [None]:
# Dimensions of the training feature matrix
X_train.shape


In [None]:
# Dimensions of the training target returned by the latest split
Y_train.shape