In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.dates as mdates

In [5]:
# Load the raw car-sharing records.
df = pd.read_csv("CarSharing.csv")

# ---------------------------------------------------------------------------
# Data-quality report
# ---------------------------------------------------------------------------
# The report is computed on the raw values, *before* the categorical columns
# are re-encoded: `.cat.codes` maps NaN to -1, so encoding first would make a
# missing season/holiday/workingday/weather look like valid data and silently
# exclude it from every missing-value count below.

# Rows that are exact copies of an earlier row (all columns equal).
duplicates = df.duplicated()
num_duplicates = duplicates.sum()
print(f"\nNumber of duplicate rows: {num_duplicates}")

# Boolean frame: True wherever a cell is missing. Shared by both counts below.
missing_mask = df.isnull()

# Rows with more than one missing field, counted with a vectorized row-wise
# sum instead of a Python-level iterrows() loop.
rows_with_multiple_missing = int((missing_mask.sum(axis=1) > 1).sum())
print(f"\nNumber of rows with more than 1 missing value: {rows_with_multiple_missing}")

# Missing values per column.
missing_values_per_column = missing_mask.sum()
print("\n")
for column, num_missing in missing_values_per_column.items():
    print(f"Column '{column}' has {num_missing} missing values")

# ---------------------------------------------------------------------------
# Transformations
# ---------------------------------------------------------------------------
# Replace each categorical label with its integer category code.
# NOTE: codes follow pandas' inferred category order (sorted labels), not any
# domain order, and a missing label becomes -1.
categorical_columns = ['season', 'holiday', 'workingday', 'weather']
for column in categorical_columns:
    df[column] = df[column].astype('category').cat.codes

# Fill gaps in the continuous weather measurements by linear interpolation
# along the rows. limit_direction='both' also fills NaNs that precede the
# first valid observation of a column; 'forward' would leave those leading
# gaps in the "preprocessed" output.
# NOTE(review): interpolating along the row axis presumes the rows are in
# chronological order -- confirm against the 'timestamp' column.
columns_to_interpolate = ['temp', 'temp_feel', 'humidity', 'windspeed']
df[columns_to_interpolate] = df[columns_to_interpolate].interpolate(
    method='linear', limit_direction='both', axis=0
)

# Persist the cleaned frame for the downstream analysis steps.
df.to_csv('CarSharing_Preprocessed.csv', index=False)



Number of duplicate rows: 0

Number of rows with more than 1 missing value: 20


Column 'id' has 0 missing values
Column 'timestamp' has 0 missing values
Column 'season' has 0 missing values
Column 'holiday' has 0 missing values
Column 'workingday' has 0 missing values
Column 'weather' has 0 missing values
Column 'temp' has 1202 missing values
Column 'temp_feel' has 102 missing values
Column 'humidity' has 39 missing values
Column 'windspeed' has 200 missing values
Column 'demand' has 0 missing values
