<h1>Importing the dataset</h1>

In [None]:
import numpy as np
import pandas as pd

# Read the raw dataset from the working directory and print its schema
# (column names, dtypes and non-null counts).
dataset_path = 'Dataset_1.csv'
d1 = pd.read_csv(dataset_path)
d1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4706 entries, 0 to 4705
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Job Titles         4706 non-null   object 
 1   AI Impact          4706 non-null   float64
 2   Tasks              4706 non-null   int64  
 3   AI Models          4706 non-null   int64  
 4   AI Workload Ratio  4706 non-null   float64
 5   Domain             4706 non-null   object 
dtypes: float64(2), int64(2), object(2)
memory usage: 220.7+ KB


<p>Every column reports 4706 non-null entries, so there are no missing (null) values and we can proceed with further analysis of the data.</p>

In [2]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
d1.describe()

  sqr = _ensure_numeric((avg - values) ** 2)


Unnamed: 0,AI Impact,Tasks,AI Models,AI Workload Ratio
count,4706.0,4706.0,4706.0,4706.0
mean,0.303126,400.708032,1817.678071,inf
std,0.182038,311.564781,1086.853037,
min,0.05,1.0,0.0,0.036585
25%,0.15,161.0,1085.25,0.137271
50%,0.25,270.0,1577.5,0.199281
75%,0.4,608.75,2273.0,0.260572
max,0.98,1387.0,5666.0,inf


<h1>Removing rows with 'inf' values</h1>

In [3]:
# Drop every row whose workload ratio is infinite. np.isinf matches both +inf
# and -inf, whereas an equality test against np.inf only matches +inf.
# The drop is done in place so that `d1` remains the same object for the later
# cells that bind additional names to it.
inf_rows = d1.index[np.isinf(d1['AI Workload Ratio'])]
d1.drop(index=inf_rows, inplace=True)

# Re-check the summary statistics now that the infinite ratios are gone.
d1.describe()

Unnamed: 0,AI Impact,Tasks,AI Models,AI Workload Ratio
count,4699.0,4699.0,4699.0,4699.0
mean,0.303279,401.301341,1820.385827,0.204619
std,0.18213,311.416995,1085.393648,0.076547
min,0.05,1.0,1.0,0.036585
25%,0.15,162.0,1087.0,0.137262
50%,0.25,271.0,1578.0,0.198885
75%,0.4,609.0,2277.5,0.26045
max,0.98,1387.0,5666.0,1.0


In [6]:
# Discard, in place, the rows whose AI Workload Ratio is 0.5 or higher.
# NOTE(review): the 0.5 cut-off is not explained anywhere in the notebook — confirm the rationale.
high_ratio_rows = d1.index[d1['AI Workload Ratio'] >= 0.5]
d1.drop(index=high_ratio_rows, inplace=True)


In [8]:
# Display the rows ordered from highest to lowest workload ratio.
# sort_values returns a new frame here, so d1 itself keeps its original order.
d1.sort_values(by='AI Workload Ratio', ascending=False)

Unnamed: 0,Job Titles,AI Impact,Tasks,AI Models,AI Workload Ratio,Domain
3531,Computer Teacher,0.40,562,1344,0.418155,Construction
3563,Er Rn,0.40,214,523,0.409178,Hospitality
2924,Strategy Analyst,0.30,784,1928,0.406639,Data & IT
2320,Sports,0.25,575,1415,0.406360,Supply Chain & Logistics
2099,Sports Director,0.25,672,1656,0.405797,Medical & Healthcare
...,...,...,...,...,...,...
4348,Custodian,0.60,8,151,0.052980,Administrative & Clerical
4371,General Worker,0.60,19,409,0.046455,Supply Chain & Logistics
4370,General Labor,0.60,19,409,0.046455,Medical & Healthcare
4347,Carpet Cleaner,0.60,6,133,0.045113,Data & IT


In [9]:
# `data` is a second name for the same DataFrame object as `d1` (no copy is made),
# so any column assigned through `data` is also visible through `d1`.
data = d1

# Update Task Type classification based on broad job title knowledge

def classify_task_type_by_title(job_title):
    """Label a job title as "Routine" or "Complex" from keywords it contains.

    Matching is a case-sensitive substring test. Routine keywords take
    precedence over complex ones, and a title that matches neither list is
    treated as "Routine".

    Parameters
    ----------
    job_title : str
        The job title to classify.

    Returns
    -------
    str
        Either "Routine" or "Complex".
    """
    routine_markers = ("Clerk", "Technician", "Data Entry", "Customer Service", "Sales", "Support")
    complex_markers = ("Manager", "Engineer", "Researcher", "Teacher", "Analyst", "Consultant", "Specialist", "Director")

    def mentions(markers):
        # True when at least one marker occurs somewhere in the title.
        return any(marker in job_title for marker in markers)

    # Only a title with no routine marker and at least one complex marker is
    # "Complex"; everything else falls back to "Routine".
    if not mentions(routine_markers) and mentions(complex_markers):
        return "Complex"
    return "Routine"

# Label every row by running its job title through the classifier.
data['Task Type'] = data['Job Titles'].map(classify_task_type_by_title)

# Preview the first rows, now including the new column.
data.head()


Unnamed: 0,Job Titles,AI Impact,Tasks,AI Models,AI Workload Ratio,Domain,Task Type
0,Postdoc,0.05,717,3200,0.224062,Law Enforcement,Routine
1,Pre K Teacher,0.05,750,3377,0.222091,Construction,Complex
2,Preschool Teacher,0.05,747,3241,0.230484,Sales & Marketing,Complex
3,Primary Teacher,0.05,740,3331,0.222156,Hospitality,Complex
4,Private Tutor,0.05,829,3513,0.235981,Communication & PR,Routine


In [10]:
def reassign_domain(job_title):
    """Map a job title to a domain using keywords found in the title.

    The rules are evaluated in order and the first rule with a matching
    keyword wins, so earlier rules take precedence (for example a title
    containing both "Manager" and "Sales" is "Management & Administration").
    Matching is a case-sensitive substring test.

    Parameters
    ----------
    job_title : str
        The job title to map.

    Returns
    -------
    str
        The matched domain name, or "General" when no keyword matches.
    """
    # (keywords, domain) pairs, listed in priority order.
    domain_rules = (
        (("Teacher", "Tutor", "Instructor"), "Education"),
        (("Engineer", "Technician"), "Engineering & Technology"),
        (("Manager", "Director"), "Management & Administration"),
        (("Analyst", "Consultant"), "Business & Consulting"),
        (("Researcher", "Postdoc"), "Research & Academia"),
        (("Customer Service", "Support"), "Customer Service"),
        (("Clerk", "Data Entry"), "Administrative"),
        (("Sales", "Marketing"), "Sales & Marketing"),
        (("Healthcare", "Nurse", "Doctor"), "Healthcare"),
        (("Lawyer", "Paralegal"), "Legal Services"),
    )
    for keywords, domain in domain_rules:
        if any(keyword in job_title for keyword in keywords):
            return domain
    return "General"

# Overwrite the Domain column with the domain derived from each job title.
data['Domain'] = data['Job Titles'].map(reassign_domain)

# Preview the first rows with the replaced Domain values.
data.head()


Unnamed: 0,Job Titles,AI Impact,Tasks,AI Models,AI Workload Ratio,Domain,Task Type
0,Postdoc,0.05,717,3200,0.224062,Research & Academia,Routine
1,Pre K Teacher,0.05,750,3377,0.222091,Education,Complex
2,Preschool Teacher,0.05,747,3241,0.230484,Education,Complex
3,Primary Teacher,0.05,740,3331,0.222156,Education,Complex
4,Private Tutor,0.05,829,3513,0.235981,Education,Routine


In [11]:
def categorize_model_sophistication(num_models, workload_ratio, domain):
    """Bucket a role into "Basic", "Intermediate" or "Advanced".

    Rules, evaluated in order:
      1. more than 3000 models and a workload ratio above 0.6 -> "Advanced"
      2. more than 1500 models and a workload ratio above 0.3 -> "Intermediate"
      3. a customer-service, administrative or sales domain   -> "Basic"
      4. otherwise "Intermediate" above 1500 models, else "Basic"

    Parameters
    ----------
    num_models : int or float
        Number of AI models associated with the role.
    workload_ratio : float
        AI workload ratio of the role.
    domain : str
        Domain label of the role.

    Returns
    -------
    str
        "Advanced", "Intermediate" or "Basic".
    """
    # NOTE(review): this notebook drops rows with a ratio >= 0.5 before calling
    # this function, so the "Advanced" rule cannot fire on that data — confirm
    # whether the 0.6 threshold is intended.
    if num_models > 3000 and workload_ratio > 0.6:
        return "Advanced"
    if num_models > 1500 and workload_ratio > 0.3:
        return "Intermediate"
    # reassign_domain() emits "Sales & Marketing", never plain "Sales", so the
    # sales rule could not match before. Both spellings are accepted here.
    if domain in ("Customer Service", "Administrative", "Sales", "Sales & Marketing"):
        return "Basic"
    return "Intermediate" if num_models > 1500 else "Basic"

# Derive the sophistication label row by row from the model count,
# the workload ratio and the (reassigned) domain.
data['Model Sophistication'] = data.apply(
    lambda row: categorize_model_sophistication(
        num_models=row['AI Models'],
        workload_ratio=row['AI Workload Ratio'],
        domain=row['Domain'],
    ),
    axis=1,
)

data.head()

Unnamed: 0,Job Titles,AI Impact,Tasks,AI Models,AI Workload Ratio,Domain,Task Type,Model Sophistication
0,Postdoc,0.05,717,3200,0.224062,Research & Academia,Routine,Intermediate
1,Pre K Teacher,0.05,750,3377,0.222091,Education,Complex,Intermediate
2,Preschool Teacher,0.05,747,3241,0.230484,Education,Complex,Intermediate
3,Primary Teacher,0.05,740,3331,0.222156,Education,Complex,Intermediate
4,Private Tutor,0.05,829,3513,0.235981,Education,Routine,Intermediate


In [16]:
# Write the enriched frame to disk without the index, in an explicit column order.
export_columns = [
    'Job Titles', 'AI Impact', 'Tasks', 'Task Type',
    'AI Models', 'Model Sophistication', 'AI Workload Ratio', 'Domain',
]
data.to_csv('processed_dataset.csv', columns=export_columns, index=False, index_label=False)

<h1>Observations from Summary Statistics</h1>
<p>AI Impact: The values range from 0.05 to 0.98, with an average of around 0.30, indicating moderate AI influence across roles on average. However, the maximum value of 0.98 suggests some roles are almost entirely impacted by AI.</p>
<p>Tasks: Task counts vary widely, from 1 to 1387, with a mean of approximately 401, suggesting significant variation in task complexity across roles.</p>
<p>AI Models: The number of AI models ranges from 1 to 5666, with an average of around 1820, hinting that some roles rely heavily on multiple AI models.</p>
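<p>AI Workload Ratio: After removing the infinite values, the workload ratio ranges from 0.04 to 1.00 and averages around 0.20. This indicates a general trend where AI models contribute to about 20% of the workload in typical roles, while a few roles are highly AI-driven. Note that these figures predate the later filter that drops rows with a ratio of 0.5 or higher; after that filter the largest remaining ratio is about 0.42.</p>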

<h1>Visualizing distributions of the key numeric parameters</h1>

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# `df` is another name for the cleaned frame `d1`, not a copy.
df = d1

# Set up visual styling
sns.set(style="whitegrid")

# One histogram with a KDE overlay per numeric parameter, laid out on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
fig.suptitle("Distribution of Key Parameters in AI Impact Dataset", fontsize=16)

# (column, bar colour) pairs in row-major panel order.
panels = [
    ('AI Impact', 'teal'),
    ('Tasks', 'coral'),
    ('AI Models', 'slateblue'),
    ('AI Workload Ratio', 'olive'),
]
for ax, (column, colour) in zip(axes.flat, panels):
    sns.histplot(df[column], bins=20, kde=True, ax=ax, color=colour)
    ax.set_title(f"{column} Distribution")

# Reserve the top strip of the figure for the figure-level title.
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()


<h1>Insights from Distributions</h1>
<p>AI Impact: The distribution shows a skew toward lower values, with a higher frequency of jobs experiencing relatively lower AI impact. This suggests that while AI is present in many roles, the majority are not yet heavily impacted.</p>
<p>Tasks: Task counts vary widely, with a peak around lower values but a long tail extending to higher numbers, indicating some roles are more complex or multifaceted than others.</p>
<p>AI Models: This follows a similar distribution to tasks, with a concentration of lower counts but some roles using numerous AI models.</p>
<p>AI Workload Ratio: This shows a notable spread, with many jobs having a lower ratio, indicating that AI takes on a smaller portion of the workload for most roles. However, some roles rely heavily on AI.</p>

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Cluster on the four numeric columns.
features = df[['AI Impact', 'Tasks', 'AI Models', 'AI Workload Ratio']]

# Standardise each column before clustering.
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Elbow method: fit KMeans for k = 1..10 and record the inertia of each fit.
cluster_counts = range(1, 11)
inertia = []
for k in cluster_counts:
    kmeans = KMeans(n_clusters=k, random_state=0).fit(features_scaled)
    inertia.append(kmeans.inertia_)

# Inertia against k; the "elbow" is where the curve stops dropping sharply.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(cluster_counts, inertia, marker='o', color='b')
ax.set_title('Elbow Method for Optimal Clusters')
ax.set_xlabel('Number of Clusters')
ax.set_ylabel('Inertia')
plt.show()


In [None]:
# Import necessary libraries
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, adjusted_rand_score

# Rebuild the standardised feature matrix used for clustering.
features = df[['AI Impact', 'Tasks', 'AI Models', 'AI Workload Ratio']]
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Fit KMeans with 2 clusters and store each row's cluster label on the frame.
kmeans = KMeans(n_clusters=2, random_state=0)
df['Cluster'] = kmeans.fit_predict(features_scaled)

# Scatter of two of the clustered features, coloured and marked by cluster.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=df,
    x='AI Impact',
    y='AI Workload Ratio',
    hue='Cluster',
    style='Cluster',
    palette='Set1',
    ax=ax,
)
ax.set_title("K-Means Clustering (2 Clusters): AI Impact vs. AI Workload Ratio")
ax.set_xlabel("AI Impact")
ax.set_ylabel("AI Workload Ratio")
ax.legend(title="Cluster")
plt.show()

# Cluster centres mapped back from standardised units to the original units.
cluster_centers = scaler.inverse_transform(kmeans.cluster_centers_)
print("Cluster Centers:\n", cluster_centers)

# Silhouette score of the 2-cluster solution, computed on the standardised features.
score = silhouette_score(features_scaled, df['Cluster'])
print("Silhouette Score:", score)



In [None]:
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score


dbscan = DBSCAN(eps=1, min_samples=3)  # Adjust parameters as needed
dbscan_labels = dbscan.fit_predict(features_scaled)

# DBSCAN marks noise points with the label -1. Noise is not a cluster, so it is
# left out of the silhouette computation instead of being scored as one.
clustered = dbscan_labels != -1
n_clusters = len(set(dbscan_labels[clustered]))

# silhouette_score needs at least two distinct labels and raises otherwise;
# report NaN in that case rather than aborting the notebook run.
if n_clusters >= 2:
    dbscan_silhouette = silhouette_score(features_scaled[clustered], dbscan_labels[clustered])
else:
    dbscan_silhouette = float("nan")
print("DBSCAN Silhouette Score:", dbscan_silhouette)

In [None]:
import numpy as np

# PCA is otherwise only imported in a later cell of this notebook; import it
# here so this cell also works when the notebook is run top to bottom.
from sklearn.decomposition import PCA

# Visualize using PCA
pca = PCA(n_components=2)
features_pca = pca.fit_transform(features_scaled)

# `style` takes a grouping variable, not marker codes. Name the two groups and
# map each one to its marker explicitly so noise points are drawn as crosses.
point_kind = np.where(dbscan_labels == -1, "noise", "clustered")

plt.figure(figsize=(8, 6))
sns.scatterplot(
    x=features_pca[:, 0],
    y=features_pca[:, 1],
    hue=dbscan_labels,
    palette="viridis",
    style=point_kind,
    markers={"noise": "X", "clustered": "o"},
)
plt.title('DBSCAN Clusters with PCA')
plt.show()


In [None]:
from sklearn.ensemble import IsolationForest

# NOTE(review): contamination=0.5 asks the forest to treat about half of the rows
# as anomalies, which discards a large share of the data — confirm this is intended.
iso_forest = IsolationForest(contamination=0.5, random_state=0)  # Adjust contamination as needed

# fit_predict returns -1 for anomalies; keep a boolean mask of them.
outliers = iso_forest.fit_predict(features_scaled) == -1

# Keep only the rows that were not flagged.
inliers = ~outliers
features_filtered = features_scaled[inliers]

# Cluster the remaining rows again and score the result.
kmeans_filtered = KMeans(n_clusters=2, random_state=0)
filtered_labels = kmeans_filtered.fit(features_filtered).labels_
filtered_silhouette = silhouette_score(features_filtered, filtered_labels)
print("Filtered KMeans Silhouette Score:", filtered_silhouette)


In [None]:
from sklearn.decomposition import PCA

# Project the standardised features onto two principal components for plotting.
pca = PCA(n_components=2)
features_2d = pca.fit_transform(features_scaled)

# Scatter of the projection, coloured by the KMeans cluster stored on the frame.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    x=features_2d[:, 0],
    y=features_2d[:, 1],
    hue=df['Cluster'],
    palette="viridis",
    ax=ax,
)
ax.set_title('PCA of Clusters')
plt.show()


<h1>Regression Model for predicting AI Impact</h1>

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# `data` refers to the same frame as `d1` (no copy).
data = d1

# Predictors and target for the regression.
feature_columns = ['Tasks', 'AI Models', 'AI Workload Ratio']
X = data[feature_columns]
y = data['AI Impact']

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a random forest on the training rows and predict the held-out rows.
rf_model = RandomForestRegressor(random_state=42).fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

# Error metrics on the held-out rows.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

mae, mse, rmse, r2


In [1]:
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd

# Work from the CSV exported earlier in the notebook.
file_path = 'processed_dataset.csv'
data = pd.read_csv(file_path)

# 1. Scatter: AI Impact against AI Workload Ratio, coloured by domain.
scatter_impact_workload = px.scatter(
    data, x="AI Workload Ratio", y="AI Impact", color="Domain",
    title="AI Impact vs. AI Workload Ratio",
    labels={"AI Workload Ratio": "AI Workload Ratio", "AI Impact": "AI Impact"},
)

# 2. Scatter: number of AI models against number of tasks, coloured by domain.
scatter_models_tasks = px.scatter(
    data, x="Tasks", y="AI Models", color="Domain",
    title="AI Models vs. Tasks",
    labels={"Tasks": "Number of Tasks", "AI Models": "Number of AI Models"},
)

# 3. Scatter: AI Impact against number of tasks, coloured by task type.
scatter_impact_tasks = px.scatter(
    data, x="Tasks", y="AI Impact", color="Task Type",
    title="AI Impact vs. Tasks",
    labels={"Tasks": "Number of Tasks", "AI Impact": "AI Impact"},
)

# 4. Box plot: spread of AI Impact within each task type.
box_impact_task_type = px.box(
    data, x="Task Type", y="AI Impact", color="Task Type",
    title="AI Impact by Task Type",
    labels={"Task Type": "Task Type", "AI Impact": "AI Impact"},
)

# 5. Box plot: spread of AI Workload Ratio within each domain.
box_workload_domain = px.box(
    data, x="Domain", y="AI Workload Ratio", color="Domain",
    title="AI Workload Ratio by Domain",
    labels={"Domain": "Industry Domain", "AI Workload Ratio": "AI Workload Ratio"},
)

# 6. Bar chart: mean AI Impact per domain.
bar_avg_impact_domain = px.bar(
    data.groupby("Domain", as_index=False)["AI Impact"].mean(),
    x="Domain", y="AI Impact", color="Domain",
    title="Average AI Impact by Domain",
    labels={"Domain": "Industry Domain", "AI Impact": "Average AI Impact"},
)

# 7. Grouped bar chart: number of rows per (domain, task type) pair.
task_type_counts = data.groupby(["Domain", "Task Type"]).size().reset_index(name="Count")
bar_task_type_domain = px.bar(
    task_type_counts, x="Domain", y="Count", color="Task Type", barmode="group",
    title="Count of Task Types by Domain",
    labels={"Domain": "Industry Domain", "Count": "Task Count"},
)

# 8. Heatmap: pairwise correlations between the four numeric columns.
correlation_matrix = data[["AI Impact", "Tasks", "AI Models", "AI Workload Ratio"]].corr()
heatmap_trace = go.Heatmap(
    z=correlation_matrix.values,
    x=correlation_matrix.columns,
    y=correlation_matrix.columns,
    colorscale='Viridis',
)
heatmap_correlation = go.Figure(data=heatmap_trace)
heatmap_correlation.update_layout(
    title="Correlation Matrix", xaxis_title="Features", yaxis_title="Features"
)

# 9. Line chart: mean AI Impact per model-sophistication level.
line_impact_sophistication = px.line(
    data.groupby("Model Sophistication", as_index=False)["AI Impact"].mean(),
    x="Model Sophistication", y="AI Impact",
    title="AI Impact Across Model Sophistication Levels",
    labels={"Model Sophistication": "Model Sophistication", "AI Impact": "Average AI Impact"},
)

# Render every figure, in the order in which it was built.
overview_figures = (
    scatter_impact_workload,
    scatter_models_tasks,
    scatter_impact_tasks,
    box_impact_task_type,
    box_workload_domain,
    bar_avg_impact_domain,
    bar_task_type_domain,
    heatmap_correlation,
    line_impact_sophistication,
)
for figure in overview_figures:
    figure.show()


In [4]:
import plotly.express as px
import plotly.graph_objects as go
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pandas as pd
import plotly
import sklearn

# Work from the CSV exported earlier in the notebook.
file_path = 'processed_dataset.csv'
data = pd.read_csv(file_path)

# 1. Scatter with one OLS trendline per domain: AI Impact against AI Workload Ratio.
# NOTE(review): plotly's trendline="ols" relies on statsmodels being installed — confirm it is available.
scatter_regression = px.scatter(
    data, x="AI Workload Ratio", y="AI Impact", color="Domain", trendline="ols",
    title="AI Impact vs. AI Workload Ratio by Domain",
    labels={"AI Workload Ratio": "AI Workload Ratio", "AI Impact": "AI Impact"},
)

# 2. Stacked bars: share of each task type within every domain.
#    Count rows per (domain, task type), then divide by the domain's total.
task_type_proportion = data.groupby(["Domain", "Task Type"]).size().reset_index(name="Count")
task_type_total = task_type_proportion.groupby("Domain")["Count"].transform("sum")
task_type_proportion["Proportion"] = task_type_proportion["Count"].div(task_type_total)

stacked_bar = px.bar(
    task_type_proportion, x="Domain", y="Proportion", color="Task Type",
    title="Proportion of Task Types by Domain",
    labels={"Proportion": "Proportion of Tasks", "Domain": "Industry Domain"},
)

# 3. Bubble chart: AI models against tasks, bubble area driven by AI Impact.
bubble_chart = px.scatter(
    data, x="Tasks", y="AI Models", size="AI Impact", color="Domain",
    title="AI Models vs. Tasks with Bubble Size as AI Impact",
    labels={"Tasks": "Number of Tasks", "AI Models": "Number of AI Models"},
)

# 4. Box plot: AI Workload Ratio per domain, split by task type.
box_workload_tasktype = px.box(
    data, x="Domain", y="AI Workload Ratio", color="Task Type",
    title="AI Workload Ratio by Domain and Task Type",
    labels={"Domain": "Industry Domain", "AI Workload Ratio": "AI Workload Ratio"},
)

# 5. Treemap: job titles nested inside domains, tile size taken from AI Impact.
treemap = px.treemap(
    data, path=["Domain", "Job Titles"], values="AI Impact",
    title="Treemap of Job Roles and AI Impact by Domain",
    labels={"AI Impact": "AI Impact"},
)

# 6. PCA projection of the numeric columns, coloured by domain.
#    Standardise the four numeric columns first.
scaler = StandardScaler()
numeric_data = data[["AI Impact", "Tasks", "AI Workload Ratio", "AI Models"]]
scaled_data = scaler.fit_transform(numeric_data)

#    Reduce to two components and store them as new columns on the frame.
pca = PCA(n_components=2)
pca_result = pca.fit_transform(scaled_data)
data["PCA1"] = pca_result[:, 0]
data["PCA2"] = pca_result[:, 1]

pca_scatter = px.scatter(
    data, x="PCA1", y="PCA2", color="Domain", hover_data=["Job Titles"],
    title="Cluster Analysis of Jobs based on PCA",
    labels={"PCA1": "Principal Component 1", "PCA2": "Principal Component 2"},
)

# 7. Faceted scatter: AI Impact against tasks, one panel per task type.
facet_plot = px.scatter(
    data, x="Tasks", y="AI Impact", color="Domain", facet_col="Task Type",
    title="AI Impact vs. Tasks by Task Type",
    labels={"Tasks": "Number of Tasks", "AI Impact": "AI Impact"},
)

# 8. Parallel coordinates over the four numeric columns, coloured by AI Impact.
parallel_plot = px.parallel_coordinates(
    data,
    dimensions=["Tasks", "AI Impact", "AI Workload Ratio", "AI Models"],
    color="AI Impact",
    color_continuous_scale=px.colors.sequential.Viridis,
    title="Parallel Coordinates Plot of Tasks, AI Impact, and Workload Ratio",
)

# Render every figure, in the order in which it was built.
detail_figures = (
    scatter_regression,
    stacked_bar,
    bubble_chart,
    box_workload_tasktype,
    treemap,
    pca_scatter,
    facet_plot,
    parallel_plot,
)
for figure in detail_figures:
    figure.show()
