<a href="https://colab.research.google.com/github/FarhaKousar1601/-VTUStudyMarkAnalysis/blob/main/%20Data%20Analysis%20and%20Machine%20Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score

# Step 1: Load the Dataset
df = pd.read_csv('/content/TemperatureData.csv')

# Display basic information and the first few rows to understand the data structure
print(df.info())
print(df.head())

# Step 2: Data Cleaning
# Check for missing values and print column names for verification
print("Missing values per column:\n", df.isnull().sum())
print("Columns:", df.columns)

# Ensure column names are consistent by stripping any extra spaces
df.columns = df.columns.str.strip()

# Handle missing values in the 'Temperature' column by filling with the mean
if 'Temperature' in df.columns:
    df['Temperature'].fillna(df['Temperature'].mean(), inplace=True)
else:
    print("Column 'Temperature' not found in dataset.")

# Convert the 'Date' column to datetime format (if it exists) for easier manipulation
if 'Date' in df.columns:
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
else:
    print("Column 'Date' not found in dataset.")

# Step 3: String Manipulation (for text columns, if any)
# For example, let's assume a 'Location' column that may need cleaning
if 'Location' in df.columns:
    df['Location'] = df['Location'].str.lower().str.strip()
else:
    print("Column 'Location' not found in dataset.")

# Step 4: Use NumPy for Basic Statistics
# Convert Temperature data to a NumPy array for statistical calculations
if 'Temperature' in df.columns:
    temp_array = np.array(df['Temperature'])
    mean_temp = np.mean(temp_array)
    median_temp = np.median(temp_array)
    print("Mean Temperature:", mean_temp)
    print("Median Temperature:", median_temp)
else:
    print("Column 'Temperature' not found for statistics calculation.")

# Step 5: Feature Engineering and Data Splitting
# Extract Year, Month, and Day from the 'Date' column for additional features
if 'Date' in df.columns:
    df['Year'] = df['Date'].dt.year
    df['Month'] = df['Date'].dt.month
    df['Day'] = df['Date'].dt.day
else:
    print("Feature engineering on 'Date' column is skipped as it's not found.")

# Define features (X) and target variable (y)
if {'Temperature', 'Year', 'Month', 'Day'}.issubset(df.columns):
    X = df[['Year', 'Month', 'Day']]
    y = df['Temperature']
else:
    print("Required columns for model training are missing.")
    X = y = None

# Proceed with training only if X and y are defined
if X is not None and y is not None:
    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Step 6: Build and Evaluate a Linear Regression Model
    # Initialize and train the linear regression model
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Predict on the test set
    y_pred = model.predict(X_test)

    # Evaluate the model
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print("Mean Absolute Error (MAE):", mae)
    print("R-squared Score:", r2)
else:
    print("Skipping model training due to missing required columns.")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 565 entries, 0 to 564
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Year          565 non-null    int64 
 1   Month         565 non-null    object
 2   Day           565 non-null    int64 
 3   TemperatureF  565 non-null    int64 
dtypes: int64(3), object(1)
memory usage: 17.8+ KB
None
   Year    Month  Day  TemperatureF
0  2015  January    1            23
1  2015  January    2            31
2  2015  January    3            25
3  2015  January    4            39
4  2015  January    5            29
Missing values per column:
 Year            0
Month           0
Day             0
TemperatureF    0
dtype: int64
Columns: Index(['Year', 'Month', 'Day', 'TemperatureF'], dtype='object')
Column 'Temperature' not found in dataset.
Column 'Date' not found in dataset.
Column 'Location' not found in dataset.
Column 'Temperature' not found for statistics calculation.
Feature 