In [None]:
import re

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# Occupation classifier: TF-IDF features + multinomial logistic regression.
# Predicts LOT_SPECIALIZED_OCCUPATION_NAME from the free-text posting BODY.

df = pd.read_csv("lightcast_job_postings.csv")
# Keep only postings that have body text. `.copy()` makes the filtered result
# an independent frame, so adding columns to `df` later does not raise
# pandas' SettingWithCopyWarning.
df = df[df['BODY'].notnull()].copy()

X_text = df['BODY']
y = df['LOT_SPECIALIZED_OCCUPATION_NAME']

# Split the RAW text before vectorizing. Fitting TF-IDF on the full corpus
# would let the vocabulary and IDF weights see the held-out postings (data
# leakage) and make the reported scores optimistic.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X_text, y, test_size=0.3, random_state=42
)

tfidf = TfidfVectorizer(stop_words='english', max_features=1000)
X_train = tfidf.fit_transform(X_train_text)  # learn vocabulary/IDF from train only
X_test = tfidf.transform(X_test_text)        # reuse the train-fitted vocabulary

# Full-corpus matrix kept under its original name for any later cell that uses
# it; it is produced by the train-fitted vectorizer.
X_tfidf = tfidf.transform(X_text)

clf = LogisticRegression(max_iter=200)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Classification Report:
                                   precision    recall  f1-score   support

      Business Analyst (General)       0.74      0.62      0.67      1355
   Business Intelligence Analyst       0.86      0.71      0.77      1116
                    Data Analyst       0.85      0.95      0.90      8326
            Data Quality Analyst       0.76      0.56      0.64       326
            Enterprise Architect       0.97      0.94      0.95      2458
          Financial Data Analyst       0.79      0.53      0.63       333
General ERP Analyst / Consultant       0.84      0.83      0.84      3014
              Healthcare Analyst       0.88      0.51      0.64        75
               Marketing Analyst       0.78      0.13      0.23        53
     Oracle Consultant / Analyst       0.94      0.92      0.93      2406
             SAP Analyst / Admin       0.79      0.76      0.77      2281

                        accuracy                           0.86     21743
            

: 

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Keywords that mark an occupation name as an "AI role".
# NOTE(review): 'analyst', 'engineer' and 'developer' are generic role words,
# not AI terms; with them in the list almost every analyst/engineering title is
# labelled AI. Confirm this is intended before interpreting the word clouds.
ai_keywords = ['ai', 'artificial', 'machine learning', 'deep learning',
               'data scientist', 'nlp', 'computer vision', 'ml', 'llm',
               'analyst', 'engineer', 'developer']

# Keywords this short are acronyms and must match as whole words only.
_MAX_ACRONYM_LEN = 3


def _keyword_to_regex(keyword):
    """Return a regex fragment matching `keyword` in lower-cased text.

    Short acronyms ('ai', 'ml', 'nlp', 'llm') are wrapped in word boundaries so
    they do not fire inside unrelated words such as "maintenance" or "html".
    Longer keywords keep plain substring semantics, so inflected forms like
    "analysts" or "engineering" still match.
    """
    escaped = re.escape(keyword.lower())
    if len(keyword) <= _MAX_ACRONYM_LEN:
        return r'\b' + escaped + r'\b'
    return escaped


# Compiled once from `ai_keywords` when this cell runs; re-run the cell after
# editing the keyword list.
_AI_KEYWORD_RE = re.compile('|'.join(_keyword_to_regex(kw) for kw in ai_keywords))


def is_ai_job(text):
    """Flag whether a text mentions any of the `ai_keywords`.

    Parameters
    ----------
    text : str or missing value
        Text to inspect (the notebook passes the occupation name).

    Returns
    -------
    int
        1 if any keyword matches (case-insensitively), otherwise 0.
        Missing values (None / NaN) yield 0.
    """
    if pd.isnull(text):
        return 0
    return int(_AI_KEYWORD_RE.search(text.lower()) is not None)

# Word clouds comparing posting text for AI-flagged vs. other occupations.

# Flag each posting from its occupation name. `assign` returns a new frame, so
# this does not write into a filtered slice of the original data.
df = df.assign(IS_AI_ROLE=df['LOT_SPECIALIZED_OCCUPATION_NAME'].apply(is_ai_job))

# Concatenate all posting bodies of each group into one document per group.
ai_text = ' '.join(df[df['IS_AI_ROLE'] == 1]['BODY'].dropna())
nonai_text = ' '.join(df[df['IS_AI_ROLE'] == 0]['BODY'].dropna())

# `stopwords` is deliberately left at its default: WordCloud then uses its
# built-in English STOPWORDS set. Passing the string 'english' is wrong here --
# the parameter expects a set of words, and a string is iterated character by
# character, so only single letters would be filtered out.
ai_wc = WordCloud(width=800, height=400, background_color='white').generate(ai_text)
nonai_wc = WordCloud(width=800, height=400, background_color='white').generate(nonai_text)

# One figure per group, drawn with the same settings.
for cloud, title in ((ai_wc, 'AI Job Word Cloud'), (nonai_wc, 'Non-AI Job WordCloud')):
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(cloud, interpolation='bilinear')
    ax.axis('off')
    ax.set_title(title)
    plt.show()