In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import LabelEncoder
from datetime import datetime

In [None]:

# Load the raw job-postings export. The path is relative to the notebook's
# working directory.
file_path = 'all_upwork_jobs_2024-02-07-2024-03-24.csv'
df = pd.read_csv(file_path)

# Quick structural overview. Only the last expression of a cell is rendered
# automatically, so intermediate results are printed explicitly; a bare
# `df.head()` in the middle of the cell would be silently discarded.
print(df.shape)
print(df.head())

# info() writes its summary to stdout itself; the per-column null counts are
# the cell's final expression and are rendered as its output.
df.info()
df.isnull().sum()

In [None]:
# -------------------------------
# 4. Data Cleaning & Preprocessing
# -------------------------------

# Keep only postings that carry a complete hourly range. The explicit copy
# makes the result an independent frame, so the column assignments below do
# not trigger SettingWithCopyWarning.
df = df.dropna(subset=['hourly_low', 'hourly_high']).copy()

# Average hourly rate: midpoint of the advertised range
df['avg_hourly_rate'] = (df['hourly_low'] + df['hourly_high']) / 2

# Parse date (CORRECT column name); unparseable values become NaT and the
# corresponding rows are dropped
df['published_date'] = pd.to_datetime(df['published_date'], errors='coerce')
df = df.dropna(subset=['published_date']).copy()

# Monthly grouping key
df['month'] = df['published_date'].dt.to_period('M')

# Clean job title: lower-case, then strip everything that is not an ASCII
# letter or a space. Missing titles become empty strings so that the text
# vectorizers used later never receive NaN.
df['job_title_clean'] = (
    df['title']
    .fillna('')
    .str.lower()
    .str.replace('[^a-zA-Z ]', '', regex=True)
)

# Derived job category (since CSV has no job_category): the first two words
# of the cleaned title
df['job_category'] = (
    df['job_title_clean']
    .str.split()
    .str[:2]
    .str.join(' ')
)

# Remote flag (derived from title, since work_type column does NOT exist).
# na=False maps a missing title to False; without it the NaN result cannot be
# cast to int.
df['is_remote'] = (
    df['title']
    .str.contains('remote', case=False, na=False)
    .astype(int)
)

# Country cleanup
df['country'] = df['country'].fillna('Unknown')


In [None]:
# -------------------------------
# 5. Exploratory Data Analysis (EDA)
# -------------------------------

# Distribution of the midpoint hourly rate across all retained postings
fig, ax = plt.subplots()
sns.histplot(df['avg_hourly_rate'], bins=40, ax=ax)
ax.set_title('Hourly Rate Distribution')
plt.show()

# Ten countries with the highest mean hourly rate
country_salary = (
    df.groupby('country')['avg_hourly_rate']
    .mean()
    .sort_values(ascending=False)
    .head(10)
)

fig, ax = plt.subplots()
country_salary.plot(kind='bar', ax=ax)
ax.set_title('Top Countries by Average Hourly Rate')
plt.show()

In [None]:
# -------------------------------
# 6. Task 1: Job Title Keywords vs Salary (TF-IDF)
# -------------------------------

# Vectorize the cleaned titles, keeping at most 100 terms and dropping
# English stop words
tfidf = TfidfVectorizer(stop_words='english', max_features=100)
X_tfidf = tfidf.fit_transform(df['job_title_clean'])

# Fit a random forest on the TF-IDF features to predict the average hourly
# rate; the model is fitted on all rows and used only for its importances
salary_model = RandomForestRegressor(random_state=42, n_estimators=200)
salary_model.fit(X_tfidf, df['avg_hourly_rate'])

# Importance of each retained term, labelled by the term itself
feature_importance = pd.Series(
    data=salary_model.feature_importances_,
    index=tfidf.get_feature_names_out(),
)

# The 15 most important title keywords
feature_importance.sort_values(ascending=False).iloc[:15]


In [None]:
# -------------------------------
# 7. Task 2: Emerging Job Categories
# -------------------------------

# Month bucket from the parsed posting date. The date column parsed during
# cleaning is 'published_date'; indexing df['date_posted'] raises KeyError.
df['month'] = df['published_date'].dt.to_period('M')

# Number of postings per (month, category); combinations that never occur
# are filled with 0
category_trends = (
    df.groupby(['month', 'job_category'])
    .size()
    .unstack(fill_value=0)
)

# Mean month-over-month relative change per category. A category that goes
# from 0 postings to any positive count yields +inf; that growth is undefined,
# so it is mapped to NaN (skipped by mean) instead of dominating the ranking.
category_growth = (
    category_trends
    .pct_change()
    .replace([np.inf, -np.inf], np.nan)
    .mean()
    .sort_values(ascending=False)
)

category_growth.head(10)

In [None]:
# -------------------------------
# 8. Task 3: High-Demand Job Role Prediction
# -------------------------------

# One row per (month, category) with the number of postings as the target
role_demand = df.groupby(['month', 'job_category']).size().reset_index(name='count')

# Integer-encode the category label so it can be used as a model feature
encoder = LabelEncoder()
role_demand['job_encoded'] = encoder.fit_transform(role_demand['job_category'])

# Encode the month period 'YYYY-MM' as the integer YYYYMM; regex=False makes
# the '-' replacement an explicit literal match
role_demand['month_num'] = (
    role_demand['month']
    .astype(str)
    .str.replace('-', '', regex=False)
    .astype(int)
)

X = role_demand[['job_encoded', 'month_num']]
y = role_demand['count']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Seeded like the train/test split and the random forest used earlier, so
# the reported RMSE is reproducible across Restart & Run All
demand_model = GradientBoostingRegressor(n_estimators=300, random_state=42)
demand_model.fit(X_train, y_train)

y_pred = demand_model.predict(X_test)

print('RMSE:', np.sqrt(mean_squared_error(y_test, y_pred)))

In [None]:
 -------------------------------
# 9. Task 4: Country-wise Hourly Rate Comparison
# -------------------------------

country_rates = df.groupby('country')['avg_hourly_rate'].mean().sort_values(ascending=False)

plt.figure()
country_rates.head(10).plot(kind='bar')
plt.title('Average Hourly Rate by Country')
plt.show()


In [None]:

# -------------------------------
# 10. Task 5: Job Recommendation Engine
# -------------------------------

# TF-IDF vectors of the cleaned titles. Row i of job_matrix corresponds to
# the i-th row of df by position (df.iloc[i]).
tfidf_rec = TfidfVectorizer(stop_words='english')
job_matrix = tfidf_rec.fit_transform(df['job_title_clean'])


def recommend_jobs(job_index, top_n=5):
    """Return the ``top_n`` postings whose titles are most similar to one job.

    Similarity is the cosine similarity between TF-IDF title vectors. It is
    computed on demand for the query row only, so memory use is linear in the
    number of postings; materializing the full pairwise matrix up front needs
    memory quadratic in the number of postings.

    Parameters
    ----------
    job_index : int
        Positional (``iloc``) index of the query posting in ``df``. Negative
        values count from the end.
    top_n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        The 'title', 'avg_hourly_rate' and 'country' columns of the
        recommended postings, most similar first. The query posting itself is
        never included.

    Raises
    ------
    IndexError
        If ``job_index`` is outside the range of ``job_matrix`` rows.
    """
    if job_index < 0:
        job_index += job_matrix.shape[0]

    # Similarity of the query row against every posting, as a flat 1-D array
    scores = cosine_similarity(job_matrix[job_index], job_matrix).ravel()

    # Stable descending order: ties keep their original row order
    ranked = np.argsort(-scores, kind='stable')

    # Drop the query by index rather than assuming it ranks first; duplicate
    # titles tie with it at the top, and an all-zero title vector scores 0
    # even against itself.
    ranked = ranked[ranked != job_index][:top_n]

    return df.iloc[ranked][['title', 'avg_hourly_rate', 'country']]


recommend_jobs(10)


In [None]:
# -------------------------------
# 11. Task 6: Monthly Job Market Dynamics
# -------------------------------

# Number of postings in each month bucket
monthly_jobs = df.groupby('month').size()

# Line chart of the posting volume over time
fig, ax = plt.subplots()
monthly_jobs.plot(ax=ax)
ax.set_title('Monthly Job Posting Trend')
plt.show()

In [None]:

# -------------------------------
# 12. Task 7: Remote Work Trends
# -------------------------------

# Share of postings flagged as remote in each month (mean of the 0/1 flag)
remote_trend = df.groupby('month')['is_remote'].mean()

# Line chart of the remote share over time
fig, ax = plt.subplots()
remote_trend.plot(ax=ax)
ax.set_title('Remote Work Trend Over Time')
plt.show()


In [None]:
# -------------------------------
# 13. Task 8: Future Job Market Prediction
# -------------------------------

# Build features for the month AFTER the last observed one, one row per known
# job category. Re-scoring the held-out test rows would only repeat the
# evaluation on past months; it would not be a forecast.
next_month = role_demand['month'].max() + 1

future_X = pd.DataFrame({
    # LabelEncoder codes are the positions of the labels in encoder.classes_
    'job_encoded': np.arange(len(encoder.classes_)),
    # Same YYYYMM integer encoding as the training feature 'month_num'
    'month_num': int(next_month.strftime('%Y%m')),
})
future_predictions = demand_model.predict(future_X)

# NOTE: tree ensembles do not extrapolate beyond the range of month_num seen
# in training, so treat these figures as a baseline rather than a trend
# forecast.
future_demand = pd.DataFrame({
    'job_category': encoder.classes_,
    'month': str(next_month),
    'Predicted_Demand': future_predictions,
}).sort_values('Predicted_Demand', ascending=False)

# Printed explicitly: this is not the last expression of the cell, so it
# would not be rendered otherwise.
print(future_demand.head(10))

# -------------------------------
# 14. Final Conclusion
# -------------------------------

print('Project Completed Successfully')