In [1]:
# Mount Google Drive at /content/drive so files stored under MyDrive are
# reachable through the local filesystem for the rest of the notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Install the necessary packages
!pip install PyMuPDF
!pip install prophet

# Import the necessary libraries
import fitz  # PyMuPDF
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.arima.model import ARIMA  # Updated to statsmodels.tsa.arima.model.ARIMA
from prophet import Prophet  # Updated import statement

# Mount Google Drive if running in Colab (reports "already mounted" and does
# nothing else when an earlier cell has mounted it).
from google.colab import drive
drive.mount('/content/drive')

import re  # used only by the PDF record parser defined in this cell

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Path to the PDF file
pdf_path = '/content/drive/MyDrive/Unemployment in India.pdf'
csv_path = 'unemployment_data.csv'

# Column names assigned to the parsed records. The PDF header reads
# "Estimated Unemployment Rate (%)"; it is renamed to 'Unemployment_Rate'
# so the analysis below can refer to a short, stable name.
RECORD_COLUMNS = [
    'Region', 'Date', 'Frequency', 'Unemployment_Rate',
    'Estimated_Employed', 'Labour_Participation_Rate', 'Area',
]
NUMERIC_COLUMNS = ['Unemployment_Rate', 'Estimated_Employed', 'Labour_Participation_Rate']
DATE_FORMAT = '%d-%m-%Y'                      # dates appear as e.g. 31-05-2019
DATE_LINE = re.compile(r'^\d{2}-\d{2}-\d{4}$')
MONTH_END = pd.offsets.MonthEnd()             # offset object works across pandas versions
SEASONAL_PERIOD = 12                          # months per seasonal cycle
FORECAST_MONTHS = 12                          # Prophet forecast horizon


def parse_unemployment_records(raw_text):
    """Turn the text extracted from the PDF into a list of table rows.

    The extracted text holds one table cell per line rather than one
    comma-separated row per line, and the last two cells of every row
    (participation rate and area) share a single line, e.g. ``43.24 Rural``.
    Each row is therefore located by its date cell: the line before it is
    the region and the four lines after it are frequency, unemployment
    rate, employed count and ``"<participation rate> <area>"``.

    Parameters
    ----------
    raw_text : str
        Concatenated text of all PDF pages.

    Returns
    -------
    list[list[str]]
        One 7-item list per record, ordered like ``RECORD_COLUMNS``. Values
        are left as strings; type conversion happens later with pandas.
        Header lines and truncated trailing records are skipped.
    """
    cells = [ln.strip() for ln in raw_text.split('\n') if ln.strip()]
    records = []
    for i, cell in enumerate(cells):
        # Need one cell before the date and four after it for a full record.
        if i == 0 or i + 4 >= len(cells) or not DATE_LINE.match(cell):
            continue
        frequency, rate, employed, tail = cells[i + 1:i + 5]
        tail_parts = tail.split(maxsplit=1)
        participation = tail_parts[0]
        area = tail_parts[1] if len(tail_parts) > 1 else ''
        records.append([cells[i - 1], cell, frequency, rate, employed, participation, area])
    return records


# ---------------------------------------------------------------------------
# Load: read the PDF text (the context manager closes the document afterwards)
# ---------------------------------------------------------------------------
with fitz.open(pdf_path) as doc:
    text = "".join(page.get_text() for page in doc)

# Check the first part of the text to understand its structure
print(text[:1000])

records = parse_unemployment_records(text)

if records:
    # -----------------------------------------------------------------------
    # Clean: typed DataFrame of the parsed records
    # -----------------------------------------------------------------------
    records_df = pd.DataFrame(records, columns=RECORD_COLUMNS)
    # Explicit day-first format; unparseable values become NaT and are dropped.
    records_df['Date'] = pd.to_datetime(records_df['Date'], format=DATE_FORMAT, errors='coerce')
    for col in NUMERIC_COLUMNS:
        records_df[col] = pd.to_numeric(records_df[col], errors='coerce')
    records_df = records_df.dropna(subset=['Date']).reset_index(drop=True)

    # Print the first few rows to verify the content
    print(records_df.head())

    # Save DataFrame to CSV, then load it back indexed by date
    records_df.to_csv(csv_path, index=False)
    data = pd.read_csv(csv_path, parse_dates=['Date'], index_col='Date')

    # Data preprocessing: forward-fill missing values
    # (DataFrame.ffill replaces the deprecated fillna(method='ffill')).
    data = data.ffill()

    # -----------------------------------------------------------------------
    # Exploratory Data Analysis
    # -----------------------------------------------------------------------
    print(data.describe())
    plt.figure(figsize=(10, 6))
    # Several rows share each date (one per region/area); seaborn draws their
    # mean with a confidence band.
    sns.lineplot(data=data.reset_index(), x='Date', y='Unemployment_Rate')
    plt.title('Unemployment Rate Over Time')
    plt.show()

    # The time-series models need a single, regularly spaced series, so the
    # per-region rows are collapsed to one value per month.
    # NOTE(review): this is an unweighted mean over all rows of a month —
    # confirm that is the intended aggregate.
    monthly_rate = data['Unemployment_Rate'].resample(MONTH_END).mean().ffill()

    # -----------------------------------------------------------------------
    # Time Series Decomposition
    # -----------------------------------------------------------------------
    # seasonal_decompose needs at least two complete seasonal cycles.
    if len(monthly_rate) >= 2 * SEASONAL_PERIOD:
        decomposition = seasonal_decompose(monthly_rate, model='additive', period=SEASONAL_PERIOD)
        decomposition.plot()
        plt.show()
    else:
        print(
            f"Skipping seasonal decomposition: {len(monthly_rate)} monthly observations, "
            f"need at least {2 * SEASONAL_PERIOD}."
        )

    # -----------------------------------------------------------------------
    # ARIMA Model
    # -----------------------------------------------------------------------
    # statsmodels.tsa.arima.model.ARIMA.fit() has no `disp` argument.
    arima_model = ARIMA(monthly_rate, order=(5, 1, 0))
    model_fit = arima_model.fit()
    print(model_fit.summary())

    # -----------------------------------------------------------------------
    # Forecasting with Prophet
    # -----------------------------------------------------------------------
    # Prophet expects exactly the columns 'ds' (timestamp) and 'y' (value).
    df = monthly_rate.rename('y').rename_axis('ds').reset_index()
    model = Prophet()
    model.fit(df)
    # The history is monthly, so the forecast is made in monthly steps too.
    future = model.make_future_dataframe(periods=FORECAST_MONTHS, freq=MONTH_END)
    forecast = model.predict(future)
    fig = model.plot(forecast)
    plt.show()
else:
    print("The data extracted from the PDF does not seem to be in the expected format.")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Region
Date
Frequency
Estimated Unemployment Rate (%)
Estimated Employed
Estimated Labour Participation Rate (%)
Area
Andhra Pradesh
31-05-2019
Monthly
3.65
11999139
43.24 Rural
Andhra Pradesh
30-06-2019
Monthly
3.05
11755881
42.05 Rural
Andhra Pradesh
31-07-2019
Monthly
3.75
12086707
43.5 Rural
Andhra Pradesh
31-08-2019
Monthly
3.32
12285693
43.97 Rural
Andhra Pradesh
30-09-2019
Monthly
5.17
12256762
44.68 Rural
Andhra Pradesh
31-10-2019
Monthly
3.52
12017412
43.01 Rural
Andhra Pradesh
30-11-2019
Monthly
4.12
11397681
41 Rural
Andhra Pradesh
31-12-2019
Monthly
4.38
12528395
45.14 Rural
Andhra Pradesh
31-01-2020
Monthly
4.84
12016676
43.46 Rural
Andhra Pradesh
29-02-2020
Monthly
5.91
11723617
42.83 Rural
Andhra Pradesh
31-03-2020
Monthly
4.06
11359660
40.66 Rural
Andhra Pradesh
30-04-2020
Monthly
16.29
8792827
36.03 Rural
Andhra Pradesh
31-05-2020
Monthly
14.