In [None]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Fetch the GDP table (a CSV served from the datasets/gdp GitHub repository)
url = "https://raw.githubusercontent.com/datasets/gdp/master/data/gdp.csv"
df = pd.read_csv(url)

# First look at the data: leading rows, null counts per column, and the
# describe() summary. Each heading and its table go through a single print,
# separated by a newline, so the emitted text matches two consecutive prints.
print("First few rows of the dataset:", df.head(), sep="\n")
print("\nMissing Values per Column:", df.isnull().sum(), sep="\n")
print("\nSummary Statistics:", df.describe(), sep="\n")

# Line chart of a single country's GDP series over time ('United States' here)
us_rows = df['Country Name'] == 'United States'
fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(data=df[us_rows], x='Year', y='Value', ax=ax)
# Set the labels after plotting so they replace seaborn's column-name defaults
ax.set_title('GDP of United States over the Years')
ax.set_xlabel('Year')
ax.set_ylabel('GDP Value')
plt.show()

# Keep only the rows that actually carry a GDP figure in 'Value'
has_value = df['Value'].notna()
df_cleaned = df[has_value]

# Sanity check: show the head of the filtered frame
print("\nCleaned Data:", df_cleaned.head(), sep="\n")

# Define features and target variable (predict 'Value' based on 'Year').
# The dataset holds rows for more than one country (it is filtered by
# 'Country Name' for the line chart above). Fitting on every row would regress
# unrelated GDP series against a single 'Year' axis, so the model is restricted
# to one country -- the same one that is plotted -- and the fit and the
# forecast then describe a single series.
model_country = 'United States'
country_df = df_cleaned[df_cleaned['Country Name'] == model_country]
if country_df.empty:
    # Fail with a clear message instead of an opaque error from the split below
    raise ValueError(f"No GDP rows found for country {model_country!r}")

X = country_df[['Year']]  # Predictor (double brackets keep it a 2-D DataFrame)
y = country_df['Value']   # Target

# Split the data into training and testing sets (80% training, 20% testing);
# the fixed random_state makes the shuffle reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the linear regression and train it on the training split in one step
# (fit() returns the estimator itself, so the two calls chain)
model = LinearRegression().fit(X_train, y_train)

# Predict the target for the held-out rows
y_pred = model.predict(X_test)

# Score the predictions against the held-out targets and report both metrics
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
print(f"\nMean Squared Error: {mse:.2f}")
print(f"R2 Score: {r2:.2f}")

# Compare held-out targets (blue points) with the model's predictions (red line)
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(X_test, y_test, color='blue', label='Actual')
ax.plot(X_test, y_pred, color='red', label='Predicted')
ax.set_title('Actual vs Predicted GDP')
ax.set_xlabel('Year')
ax.set_ylabel('GDP Value')
ax.legend()
plt.show()

# Example prediction: Predict GDP for the year 2025
prediction_year = np.array([[2025]])  # Input data for year 2025
# The model was fitted on a DataFrame (X), so scikit-learn remembers the column
# names. Passing a bare NumPy array triggers its "X does not have valid feature
# names" warning; wrapping the query in a DataFrame with the same column labels
# as X keeps the predict input consistent with the training input.
prediction_input = pd.DataFrame(prediction_year, columns=X.columns)
gdp_prediction = model.predict(prediction_input)
print(f"\nPredicted GDP for the year 2025: {gdp_prediction[0]}")
