<a href="https://colab.research.google.com/github/Dcodinginsane/Pandas-for-Data-Analysis/blob/main/Logistic_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Logistic regression
Logistic regression is used for binary classification problems, where the dependent variable takes on one of two values. It models the probability of the dependent variable taking on one of the two values as a function of the independent variables.

In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression

In [2]:
# Load the loan-default dataset. The path is relative to the notebook's working
# directory, like the other read_csv calls in this notebook, so the cell is not
# tied to one machine's absolute /content directory.
data = pd.read_csv("loan_default_data.csv")
data

Unnamed: 0,credit_score,income,employment_status,education_level,default
0,650,50000,employed,bachelor,0
1,720,70000,employed,master,0
2,550,35000,unemployed,high school,1
3,600,45000,employed,bachelor,0
4,710,80000,employed,master,0
5,680,65000,employed,bachelor,0
6,500,25000,unemployed,high school,1
7,630,55000,self-employed,bachelor,0
8,720,90000,employed,master,0
9,550,40000,unemployed,high school,1


In [3]:
# Expand employment_status into one indicator column per category, then
# swap those columns in for the original text column.
one_hot = pd.get_dummies(data['employment_status'])
data = pd.concat([data.drop(columns='employment_status'), one_hot], axis=1)

In [5]:
# Expand education_level into "education_<level>" indicator columns and
# remove the original text column in the same step.
one_hot = pd.get_dummies(data['education_level'], prefix='education')
data = pd.concat([data, one_hot], axis=1).drop(columns='education_level')

In [6]:
# Display the frame to check that the two categorical columns were replaced by indicator columns
data

Unnamed: 0,credit_score,income,default,employed,self-employed,unemployed,education_bachelor,education_high school,education_master
0,650,50000,0,1,0,0,1,0,0
1,720,70000,0,1,0,0,0,0,1
2,550,35000,1,0,0,1,0,1,0
3,600,45000,0,1,0,0,1,0,0
4,710,80000,0,1,0,0,0,0,1
5,680,65000,0,1,0,0,1,0,0
6,500,25000,1,0,0,1,0,1,0
7,630,55000,0,0,1,0,1,0,0
8,720,90000,0,1,0,0,0,0,1
9,550,40000,1,0,0,1,0,1,0


In [7]:
# Separate the target column ('default') from the predictor columns
y = data['default']
X = data.drop(columns='default')

In [10]:
from sklearn.model_selection import train_test_split

In [12]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [13]:
# Fit logistic regression model
# The estimator is created with its default settings and trained on the
# training split only; X_test / y_test are not used in this cell.
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

In [14]:
# Predict on test data and evaluate accuracy
# y_pred holds the predicted labels for the test rows. score() is given
# X_test and y_test directly, so it does not reuse y_pred.
y_pred = logreg.predict(X_test)
accuracy = logreg.score(X_test, y_test)
print('Accuracy:', accuracy)

Accuracy: 0.8888888888888888


In [None]:

import pandas as pd
from sklearn.linear_model import LogisticRegression
# train_test_split is called below; importing it here lets this recap cell
# run on its own instead of relying on an import made in another cell.
from sklearn.model_selection import train_test_split

# Load data from csv file
data = pd.read_csv('loan_default_data.csv')

# One-hot encode the 'employment_status' column and drop the original
one_hot = pd.get_dummies(data['employment_status'])
data = data.drop('employment_status', axis=1)
data = pd.concat([data, one_hot], axis=1)
# One-hot encode the education_level column (columns prefixed "education_")
one_hot = pd.get_dummies(data['education_level'], prefix='education')
data = pd.concat([data, one_hot], axis=1)
data.drop('education_level', axis=1, inplace=True)

# Separate features from the 'default' target, then hold out 30% of the rows
# for testing (fixed random_state so the split is repeatable)
X = data.drop('default', axis=1)
y = data['default']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Fit logistic regression model on the training split
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

# Predict on test data and evaluate accuracy on the held-out rows
y_pred = logreg.predict(X_test)
accuracy = logreg.score(X_test, y_test)
print('Accuracy:', accuracy)

# Ridge regression
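Ridge regression is a type of linear regression that adds an L2 penalty term (proportional to the sum of the squared coefficients) to the loss function. The penalty shrinks the coefficients, which discourages the model from overfitting the data.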

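Example of ridge regression: Suppose a real estate company wants to predict the price of a house based on its size, number of bedrooms, number of bathrooms, and location. The company has collected data on house prices and these four characteristics for a sample of houses. However, the relationship between these variables is complex, and some of the independent variables may be highly correlated with each other. Ridge regression suits this situation because shrinking the coefficients keeps the estimates stable when predictors are correlated. In this example, the price is the dependent variable, and the size, number of bedrooms, number of bathrooms, and location are the independent variables.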

In [15]:
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [16]:
# Load data from csv file
# The path is relative, so real_estate_data.csv is looked up in the working directory.
# This rebinds `data`; the frame used in the logistic regression section is replaced.
data = pd.read_csv('real_estate_data.csv')

In [17]:
# Turn the categorical 'location' column into one indicator column per
# location value and swap them in for the original column.
one_hot = pd.get_dummies(data['location'])
data = pd.concat([data.drop(columns='location'), one_hot], axis=1)


In [18]:
# Separate the predictors from the 'price' target, then hold out 30% of the
# rows for testing (fixed seed so the split is repeatable)
y = data['price']
X = data.drop(columns='price')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

In [19]:
# Fit Ridge regression model
# alpha (the penalty strength, per scikit-learn's Ridge docs) is set explicitly to 1.0;
# the model is trained on the training split only.
ridge = Ridge(alpha=1.0)
ridge.fit(X_train, y_train)

In [20]:
# Predict on test data and evaluate performance
# mean_squared_error compares the held-out targets with the predictions; the
# value is in squared units of the target, so it is not directly a price error.
y_pred = ridge.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print('MSE:', mse)

MSE: 29068284365.06766


In [None]:
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Read the housing data
data = pd.read_csv('real_estate_data.csv')

# Swap the categorical 'location' column for indicator columns
one_hot = pd.get_dummies(data['location'])
data = pd.concat([data.drop(columns='location'), one_hot], axis=1)

# Predictors vs. target, then a 70/30 train/test split with a fixed seed
y = data['price']
X = data.drop(columns='price')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# Train the ridge model on the training split (fit returns the estimator itself)
ridge = Ridge(alpha=1.0).fit(X_train, y_train)

# Score the model on the held-out rows
y_pred = ridge.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print('MSE:', mse)

# Lasso regression
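Lasso regression is another type of linear regression that adds a penalty term to the loss function. Its L1 penalty (proportional to the sum of the absolute values of the coefficients) can shrink some coefficients to exactly zero, so the model effectively selects the subset of independent variables that are most relevant for prediction.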

Example of lasso regression: Suppose a marketing research firm wants to predict the sales of a product based on its advertising budget, number of promotions, and store location. The firm has collected data on sales and these three characteristics for a sample of products. However, some of the independent variables may not be important for predicting sales, and there may be too many variables to consider.

In this example, the sales would be the dependent variable, and the advertising budget, number of promotions, and store location would be the independent variables. To handle a large number of independent variables and select only the most important ones for predicting sales, the marketing research firm can fit a lasso regression model.

In [21]:
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [22]:
# load data
# The path is relative, so product_sales_data.csv is looked up in the working directory.
# This rebinds `data`; the frame used in the ridge regression section is replaced.
data = pd.read_csv('product_sales_data.csv')

In [23]:
# Pick the three predictor columns (X) and the 'sales' target column (y)
X = data.loc[:, ['advertising_budget', 'num_promotions', 'store_location']]
y = data.loc[:, 'sales']

In [24]:
# convert categorical variable to numerical using one-hot encoding
# Only 'store_location' is listed in columns=, so the two numeric predictors are
# left as they are; X is rebound to the encoded frame.
X = pd.get_dummies(X, columns=['store_location'])

In [25]:
# Hold out 30% of the rows for testing; random_state=42 keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [26]:
# create Lasso model and fit to training data
# alpha (the penalty strength, per scikit-learn's Lasso docs) is set to 0.1;
# only the training split is used here.
lasso = Lasso(alpha=0.1)
lasso.fit(X_train, y_train)

In [27]:
# make predictions on testing data and calculate mean squared error
# The result is only stored in `mse` here; a separate cell prints it.
y_pred = lasso.predict(X_test)
mse = mean_squared_error(y_test, y_pred)

In [28]:
# print model coefficients and mean squared error
# coef_ is printed as a bare array with no feature names; presumably the values
# follow the column order of X — confirm against X.columns when interpreting.
print('Model coefficients:', lasso.coef_)
print('Mean squared error:', mse)

Model coefficients: [  0.30564869  52.39163013   0.         -18.23477385   6.51680649]
Mean squared error: 937.3024982152765


In [29]:
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Read the product sales data
data = pd.read_csv('product_sales_data.csv')

# Target column (y) and the three predictor columns (X)
y = data['sales']
X = data.loc[:, ['advertising_budget', 'num_promotions', 'store_location']]

# One-hot encode the categorical store_location predictor
X = pd.get_dummies(X, columns=['store_location'])

# 70/30 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Train the lasso model on the training split (fit returns the estimator itself)
lasso = Lasso(alpha=0.1).fit(X_train, y_train)

# Evaluate on the held-out rows
y_pred = lasso.predict(X_test)
mse = mean_squared_error(y_test, y_pred)

# Report the fitted coefficients and the test error
print('Model coefficients:', lasso.coef_)
print('Mean squared error:', mse)

Model coefficients: [  0.30564869  52.39163013   0.         -18.23477385   6.51680649]
Mean squared error: 937.3024982152765
