# Linear Regression Analysis with scikit-learn

In [None]:

# Install scikit-learn
!pip install scikit-learn


In [None]:

# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


In [None]:

# Data Cleaning
# Read the raw dataset into a DataFrame; point the path below at your own CSV file.
df = pd.read_csv(filepath_or_buffer='your_data.csv')
# Optional: discard rows that contain missing values.
# df.dropna(inplace=True)
# Optional: coerce a column to the dtype the analysis expects, for example:
# df['column_name'] = df['column_name'].astype(float)


In [None]:

# Data Exploration
# Scatter the dependent variable against the independent variable to eyeball the trend.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(df['independent_variable'], df['dependent_variable'], color='blue')
ax.set_title('Scatterplot of Independent vs Dependent Variable')
ax.set_xlabel('Independent Variable')
ax.set_ylabel('Dependent Variable')
ax.grid(True)
plt.show()



# Hypothesis Statement
# I hypothesize that there is a positive linear relationship between the independent variable and the dependent variable: as the independent variable increases, the dependent variable tends to increase.


In [None]:

# Reshape Data
# LinearRegression cannot be fitted on NaN values, so keep only the rows in which
# both modelling columns are present before extracting the arrays. When no values
# are missing this selects exactly the same rows as before.
model_df = df[['independent_variable', 'dependent_variable']].dropna()
n_dropped = len(df) - len(model_df)
if n_dropped:
    # Surface the row loss instead of discarding data silently.
    print(f'Dropped {n_dropped} row(s) with missing values before modelling.')
# Selecting with a list of columns keeps X two-dimensional (n_samples, 1),
# which is the feature-matrix layout scikit-learn estimators expect.
X = model_df[['independent_variable']].values  # Independent variable
Y = model_df['dependent_variable'].values  # Dependent variable (1-D)


In [None]:

# Split the data into training and test sets
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
split_kwargs = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, Y, **split_kwargs)


In [None]:

# Model Fitting
# fit() returns the estimator itself, so construction and training can be chained.
model = LinearRegression().fit(X_train, y_train)
model  # last expression: show the fitted estimator as the cell output


In [None]:

# Prediction
# Apply the fitted model to the held-out features to obtain predicted targets.
y_pred = model.predict(X=X_test)


In [None]:

# Visualization of Regression Line
# Overlay the model's predictions (red line) on the actual test observations (blue points).
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(X_test, y_test, color='blue', label='Actual')
ax.plot(X_test, y_pred, color='red', label='Predicted', linewidth=2)
ax.set_title('Regression Line on Test Set')
ax.set_xlabel('Independent Variable')
ax.set_ylabel('Dependent Variable')
ax.legend()
ax.grid(True)
plt.show()


In [None]:

# Model Performance
# Score the held-out predictions: MSE (average squared error) and R²
# (coefficient of determination).
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
for label, value in (('Mean Squared Error:', mse), ('R² Score:', r2)):
    print(label, value)


In [None]:

# Comparison of Predicted vs Actual
# Put each test target next to the model's prediction for it, one row per test sample.
comparison_df = pd.DataFrame(dict(Actual=y_test, Predicted=y_pred))
print(comparison_df)



# Reflection on Model Performance
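# The model performed well if the R² score is close to 1 and the MSE is low relative to the variance of the dependent variable (MSE is expressed in squared units of the target, so it must be judged against the target's scale).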
# Consider potential data bias and its impact on the model's predictions.
