In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder

###  **Load data**

In [None]:
# Read the 50 Startups CSV (path sits under /content/drive, presumably a
# Colab Google Drive mount) and preview the first ten rows.
data_path = r"/content/drive/MyDrive/ML_Datasets/50_Startups.csv"
df = pd.read_csv(filepath_or_buffer=data_path)
df.head(n=10)

In [None]:
# Separate the data: every column except the last is a feature,
# the last column is the target.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# From this point `df` is rebound to the training portion only
# (training features plus a 'Profit' column taken from y_train).
df = x_train.assign(Profit=y_train)

## Data exploration

In [None]:
# Column names, dtypes and non-null counts of the current `df`.
df.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of `df`.
df.describe()

In [None]:
# Distinct values present in the 'State' column.
df['State'].unique()

In [None]:
# Pairwise correlations between the numeric columns only
# (non-numeric columns are skipped via numeric_only=True).
corr = df.corr(numeric_only=True)

# Draw the matrix as an annotated heatmap on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, annot=True, cmap='inferno', fmt=".2f", ax=ax)
ax.set_title("Feature Correlation Heatmap")
plt.show()

----------

In [None]:
# Features and target used for fitting.
# Every column of `df` except the last is a feature; the last column is the target.
x = df.iloc[:, :-1]
# Select the target as a 1-D Series rather than a one-column DataFrame
# (`iloc[:, [-1]]`). With a 2-D target the regressor is fit as multi-output
# and predict() returns an (n, 1) array, which does not match the 1-D
# `y_test` it is later compared and plotted against.
y = df.iloc[:, -1]

In [None]:
# Sanity check: features and target should have the same number of rows.
x.shape, y.shape

In [None]:
# Inspect the container types of the feature and target objects.
type(x), type(y)

## Pipeline

In [None]:
# Partition the feature columns into numeric and non-numeric groups.
# Selecting on the generic "number" kind (instead of only float64 / object)
# guarantees every column lands in exactly one group: integer, float32,
# category or string-dtype columns would otherwise match neither list and be
# silently dropped by the ColumnTransformer (its default remainder is 'drop').
num_cols = x.select_dtypes(include="number").columns
cat_cols = x.select_dtypes(exclude="number").columns

In [None]:
# Numeric branch: fill missing values with the column mean, then standardise.
num_pipline = Pipeline([
    ('imputer', SimpleImputer(strategy="mean")),
    ('scaler', StandardScaler())
])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode.
# One-hot encoding replaces the previous OrdinalEncoder: ordinal codes give
# unordered categories an arbitrary numeric order that a linear model would
# treat as a real magnitude. handle_unknown="ignore" encodes a category not
# seen during fit as all zeros instead of raising at predict time.
cat_pipline = Pipeline([
    ('imputer', SimpleImputer(strategy="most_frequent")),
    ('encoder', OneHotEncoder(handle_unknown="ignore"))
])

In [None]:
# Join the two sub-pipelines: each column group is routed through its own branch.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipline, num_cols),
        ('cat', cat_pipline, cat_cols),
    ]
)

# Full model: preprocessing followed by an ordinary least-squares regressor.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', LinearRegression()),
    ]
)

## Train the model

In [None]:
# Fit the preprocessing steps and the regressor together on x / y.
pipeline.fit(x, y)

## Predict with the model

In [None]:
# Predict on the held-out test features; the fitted pipeline applies its
# preprocessing to x_test before the regressor.
y_predict = pipeline.predict(x_test)

## Evaluate model

In [None]:
# Evaluate model accuracy on the held-out test set.
mse, r2, mae = (
    metric(y_test, y_predict)
    for metric in (mean_squared_error, r2_score, mean_absolute_error)
)

# Report each score next to its label.
for label, score in (
    ("Mean Squared Error:", mse),
    ("R-squared:", r2),
    ("Mean Absolute Error:", mae),
):
    print(label, score)

In [None]:
# Actual and predicted profit drawn against the test-sample position.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(range(len(y_test)), y_test.values, label='Actual', marker='o')
ax.plot(range(len(y_predict)), y_predict, label='Predicted', marker='x')
ax.set_title('Actual vs Predicted Profit')
ax.set_xlabel('Sample Index')
ax.set_ylabel('Profit')
ax.legend()
plt.show()


In [None]:
# Scatter plot: Actual vs Predicted
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, y_predict, color='blue', edgecolors='k')

# Diagonal reference spanning the range of actual values:
# points on this line are perfect predictions.
lowest, highest = y_test.min(), y_test.max()
ax.plot([lowest, highest], [lowest, highest], 'r-', lw=2)

ax.set_xlabel("Actual Profit")
ax.set_ylabel("Predicted Profit")
ax.set_title("Actual vs Predicted Profit")
plt.show()