In [7]:
import pandas as pd

# Location of the weather dataset (weatherHistory.csv).
# NOTE(review): this is an absolute path on one specific Windows machine;
# point it at wherever your copy of the CSV lives before running.
file_path = 'C:\\Users\\HP\\Desktop\\weatherHistory.csv'

# Load the dataset.
# Only the read itself is guarded. On failure the message is printed and the
# exception is re-raised: every later cell needs `data`, so continuing after a
# failed load would only surface as a confusing NameError further down.
try:
    data = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found.")
    raise
except Exception as e:
    print(f"An error occurred: {e}")
    raise
else:
    print("Dataset imported successfully!\n")

    # Display the first few rows of the dataset
    print("Preview of the dataset:")
    print(data.head(), "\n")

    # Display column details.
    # DataFrame.info() writes its report to stdout and returns None, so it is
    # called directly; wrapping it in print() would add a stray "None" line.
    print("Column details:")
    data.info()

Dataset imported successfully!

Preview of the dataset:
                  Formatted Date        Summary Precip Type  Temperature (C)  \
0  2006-04-01 00:00:00.000 +0200  Partly Cloudy        rain         9.472222   
1  2006-04-01 01:00:00.000 +0200  Partly Cloudy        rain         9.355556   
2  2006-04-01 02:00:00.000 +0200  Mostly Cloudy        rain         9.377778   
3  2006-04-01 03:00:00.000 +0200  Partly Cloudy        rain         8.288889   
4  2006-04-01 04:00:00.000 +0200  Mostly Cloudy        rain         8.755556   

   Apparent Temperature (C)  Humidity  Wind Speed (km/h)  \
0                  7.388889      0.89            14.1197   
1                  7.227778      0.86            14.2646   
2                  9.377778      0.89             3.9284   
3                  5.944444      0.83            14.1036   
4                  6.977778      0.83            11.0446   

   Wind Bearing (degrees)  Visibility (km)  Loud Cover  Pressure (millibars)  \
0                   25

In [8]:
# Survey every column of `data`: report how many distinct values it holds
# and show a short sample of them.
print("Unique values in each column:\n")
for column in data.columns:
    unique_values = data[column].unique()
    # One print call per column; sep="\n" puts each field on its own line.
    print(
        f"Column: {column}",
        f"Number of unique values: {len(unique_values)}",
        f"Sample unique values: {unique_values[:27]}",  # first 27 distinct values
        "-" * 40,
        sep="\n",
    )


Unique values in each column:

Column: Formatted Date
Number of unique values: 96429
Sample unique values: ['2006-04-01 00:00:00.000 +0200' '2006-04-01 01:00:00.000 +0200'
 '2006-04-01 02:00:00.000 +0200' '2006-04-01 03:00:00.000 +0200'
 '2006-04-01 04:00:00.000 +0200' '2006-04-01 05:00:00.000 +0200'
 '2006-04-01 06:00:00.000 +0200' '2006-04-01 07:00:00.000 +0200'
 '2006-04-01 08:00:00.000 +0200' '2006-04-01 09:00:00.000 +0200'
 '2006-04-01 10:00:00.000 +0200' '2006-04-01 11:00:00.000 +0200'
 '2006-04-01 12:00:00.000 +0200' '2006-04-01 13:00:00.000 +0200'
 '2006-04-01 14:00:00.000 +0200' '2006-04-01 15:00:00.000 +0200'
 '2006-04-01 16:00:00.000 +0200' '2006-04-01 17:00:00.000 +0200'
 '2006-04-01 18:00:00.000 +0200' '2006-04-01 19:00:00.000 +0200'
 '2006-04-01 20:00:00.000 +0200' '2006-04-01 21:00:00.000 +0200'
 '2006-04-01 22:00:00.000 +0200' '2006-04-01 23:00:00.000 +0200'
 '2006-04-10 00:00:00.000 +0200' '2006-04-10 01:00:00.000 +0200'
 '2006-04-10 02:00:00.000 +0200']
--------------

In [9]:
# Remove three columns from `data` and rebind the name to the result.
# errors='ignore' means a label that is not present is skipped silently
# instead of raising KeyError.
data = data.drop(
    labels=['Formatted Date', 'Loud Cover', 'Daily Summary'],
    axis='columns',
    errors='ignore',
)

# Show the remaining column index to confirm the removal
print("Columns after removal:", data.columns, sep="\n")


Columns after removal:
Index(['Summary', 'Precip Type', 'Temperature (C)', 'Apparent Temperature (C)',
       'Humidity', 'Wind Speed (km/h)', 'Wind Bearing (degrees)',
       'Visibility (km)', 'Pressure (millibars)'],
      dtype='object')


In [10]:
# Discard every row whose 'Precip Type' is missing; the other columns are
# not considered when deciding which rows to drop.
data = data.dropna(subset=['Precip Type'])

# Display column details.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; print(data.info()) would append a stray "None" to the output.
print("Column details:")
data.info()

Column details:
<class 'pandas.core.frame.DataFrame'>
Index: 95936 entries, 0 to 96452
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Summary                   95936 non-null  object 
 1   Precip Type               95936 non-null  object 
 2   Temperature (C)           95936 non-null  float64
 3   Apparent Temperature (C)  95936 non-null  float64
 4   Humidity                  95936 non-null  float64
 5   Wind Speed (km/h)         95936 non-null  float64
 6   Wind Bearing (degrees)    95936 non-null  float64
 7   Visibility (km)           95936 non-null  float64
 8   Pressure (millibars)      95936 non-null  float64
dtypes: float64(7), object(2)
memory usage: 7.3+ MB
None


In [11]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import joblib

# Encode categorical columns.
# Each column gets its OWN LabelEncoder. An encoder only remembers the classes
# from its most recent fit, so sharing one instance across both columns would
# discard the 'Summary' mapping as soon as 'Precip Type' is fitted, leaving no
# way to encode new inputs or decode values for that column later.
# NOTE(review): label encoding assigns each category an integer code, which a
# linear model treats as a magnitude. One-hot encoding may fit better, but it
# would change the feature layout any consumer of the saved model expects.
categorical_columns = ['Summary', 'Precip Type']
encoders = {}
for column_name in categorical_columns:
    encoders[column_name] = LabelEncoder()
    data[column_name] = encoders[column_name].fit_transform(data[column_name])

# Backward compatibility: the original cell left `label_encoder` bound to the
# encoder last fitted on 'Precip Type'; keep that name available.
label_encoder = encoders['Precip Type']

# Features are every remaining column except the target; the target is the
# measured temperature.
X = data.drop(columns=['Temperature (C)'])
y = data['Temperature (C)']

# Train-test split (80/20, fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = LinearRegression()
model.fit(X_train, y_train)

# Report fit quality on the held-out split, which was otherwise unused.
# LinearRegression.score returns the coefficient of determination (R^2).
r2_test = model.score(X_test, y_test)
print(f"Test R^2: {r2_test:.4f}")

# Save the model together with the fitted encoders: the model was trained on
# integer codes, so raw category strings must be encoded with the same
# mappings before it can be used for prediction.
joblib.dump(model, 'temperature_model.pkl')
joblib.dump(encoders, 'label_encoders.pkl')

print("Model trained and saved!")

Model trained and saved!
