<a href="https://colab.research.google.com/github/Aniekanukpono/Group_Q_Mini_Project/blob/main/Group_Q_mini_laptop_price_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install ydata_profiling

Import the required libraries

In [None]:

%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split as split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
from ydata_profiling import ProfileReport

In [None]:
# Load the dataset from the GitHub repository (URL points at one specific commit).
DATA_URL = (
    "https://raw.githubusercontent.com/Aniekanukpono/Group_Q_Mini_Project/"
    "471a105b944226b0d821b0c6cde0081f11d26004/laptop_price.csv"
)

# The file is decoded as Latin-1.
df = pd.read_csv(DATA_URL, encoding="latin-1")

Dataset Overview

In [None]:
# First rows of the frame as loaded
df.head()

In [None]:
# Last rows of the frame as loaded
df.tail()

In [None]:
# Column labels exactly as they were read from the CSV header
df.columns

In [None]:
# Random sample of five rows; seeded so the same rows are shown on every run.
df.sample(5, random_state=0)

In [None]:
# Column dtypes and non-null counts
df.info()

In [None]:
# Summary statistics for the numeric columns
df.describe()

In [None]:
# Summary statistics for the object-typed columns
df.describe(include = "object")

Data Cleaning And Analysis

In [None]:
# Drop the laptop ID column.
# Because the CSV is decoded as Latin-1, the file's byte-order mark shows up as stray
# characters in front of the first header. Match the column by its suffix rather than
# hard-coding those characters; this also keeps the cell safe to re-run once the
# column has already been removed (an empty selection drops nothing).
id_columns = [name for name in df.columns if str(name).endswith("laptop_ID")]
df = df.drop(columns=id_columns)
df.head()

In [None]:
# Give the price and operating-system columns more readable names.
readable_names = {"Price_euros": "Price", "OpSys": "Operating_System"}
df = df.rename(columns=readable_names)

In [None]:
# Keep only the first occurrence of each fully identical row, then preview the result.
df = df.drop_duplicates(keep="first")
df.head()

In [None]:
# Number of missing values in each column
df.isna().sum()

In [None]:
# Fill missing values: most frequent value (mode) for 'Gpu', median for 'Weight'.
df["Gpu"] = df["Gpu"].fillna(df["Gpu"].mode()[0])

# 'Weight' is text with a 'kg' suffix, so it must become float before the median
# can be taken. The conversion is skipped when the column is already numeric so that
# re-running this cell does not fail on the `.str` accessor.
if not pd.api.types.is_numeric_dtype(df["Weight"]):
    # regex=False: 'kg' is a literal suffix, not a pattern.
    df["Weight"] = df["Weight"].str.replace("kg", "", regex=False).astype(float)
df["Weight"] = df["Weight"].fillna(df["Weight"].median())

# Re-check the per-column missing-value counts after filling.
df.isnull().sum()

In [None]:
# Print the frequency table of every column, one after another.
for _, series in df.items():
    print(f"{series.value_counts()} \n")

In [None]:
# Count plot for every object-typed (non-numeric) column.

for column in df.select_dtypes(include="object").columns:
    fig, ax = plt.subplots(figsize=(20, 5))
    sns.countplot(data=df, x=column, ax=ax)
    # str.capitalize() upper-cases the first character and lower-cases the rest.
    ax.set_title(f"Count Plot For {column}".capitalize())
    # Category labels can be long; rotate them so they do not overlap.
    ax.tick_params(axis="x", labelrotation=90)
    plt.show()

In [None]:
# Bar plot of Price against every object-typed (non-numeric) column.

for column in df.select_dtypes(include="object").columns:
    plt.figure(figsize=(20, 5))
    sns.barplot(data=df, x=column, y="Price")
    # Prices are in euros, so the title uses the euro sign (it previously showed '£').
    # str.capitalize() upper-cases the first character and lower-cases the rest.
    plt.title(f"Bar Plot For {column} Vs Price in Euros(€)".capitalize())
    # Category labels can be long; rotate them so they do not overlap.
    plt.xticks(rotation=90)
    plt.show()

In [None]:
# Re-check column dtypes and non-null counts after the cleaning steps
df.info()

Creation And Training Models

In [None]:
# Features: every column except the target and the 'Inches' / 'Weight' columns,
# which are treated as not needed for the model.
# NOTE(review): if 'Inches' and 'Weight' are the only numeric predictors, no numeric
# feature is left for the scaler downstream — confirm this is intended.
x = df.drop(columns=["Price", "Inches", "Weight"])

# Target variable
y = df["Price"]

In [None]:
x_train, x_test, y_train, y_test = split(x, y, test_size= 0.35, random_state=0)

In [None]:
print(x_train.shape, y_train.shape)

In [None]:
# Split the feature columns by dtype: object-typed columns are treated as
# categorical, everything else as numeric.
categorical_columns = x.select_dtypes(include="object").columns
numerical_columns = x.select_dtypes(exclude="object").columns

In [None]:
# Column-wise preprocessing: one-hot encode the categorical columns (categories not
# seen during fitting are ignored at transform time) and standardise the numeric ones.
column_steps = [
    ("category", OneHotEncoder(handle_unknown="ignore"), categorical_columns),
    ("number", StandardScaler(), numerical_columns),
]
preprocessor = ColumnTransformer(transformers=column_steps)

In [None]:
# Chain preprocessing and the linear regressor into a single estimator so both
# are fitted and applied together.
pipeline_steps = [
    ("preprocessor", preprocessor),
    ("regressor", LinearRegression()),
]
model = Pipeline(steps=pipeline_steps)

In [None]:
# Train
# Fits the whole pipeline (preprocessing steps and regressor) on the training split only.
model.fit(x_train, y_train)

In [None]:
# Evaluate the fitted pipeline on the held-out test split.
predictions = model.predict(x_test)

# Error and goodness-of-fit between the true and predicted prices
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

for label, value in (("Mean Squared Error:", mse), ("R² Score:", r2)):
    print(label, value)

Create a data report using ydata-profiling

In [None]:
report = ProfileReport(df, title='Group Q Project Report',explorative=True)

In [None]:
report.to_file("Group Q Project Report.html")