In [15]:
import os

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [16]:
# Load data
# The CSV location is machine-specific, so it can be overridden through the
# AI_IMPACT_CSV environment variable. The original absolute path is kept as the
# fallback so the notebook still runs unchanged where that path exists.
DATA_PATH = os.environ.get(
    "AI_IMPACT_CSV",
    r'C:\Users\dines\OneDrive\Desktop\job preparation\eli tech\Global_AI_Content_Impact_Dataset.csv',
)
df = pd.read_csv(DATA_PATH)
print("=== Original Dataset (First 5 Rows) ===")
print(df.head())

=== Original Dataset (First 5 Rows) ===
       Country  Year    Industry  AI Adoption Rate (%)  \
0  South Korea  2022       Media                 44.29   
1        China  2025       Legal                 34.75   
2          USA  2022  Automotive                 81.06   
3       France  2021       Legal                 85.24   
4       France  2021      Gaming                 78.95   

   AI-Generated Content Volume (TBs per year)  Job Loss Due to AI (%)  \
0                                       33.09                   16.77   
1                                       66.74                   46.89   
2                                       96.13                   10.66   
3                                       93.76                   27.70   
4                                       45.62                   17.45   

   Revenue Increase Due to AI (%)  Human-AI Collaboration Rate (%)  \
0                           46.12                            74.79   
1                           52.4

In [17]:
# Preprocessing: Encode categorical variables
# One-hot encode the two categorical columns. The encoder output is converted
# to a dense array with .toarray() before it is wrapped in a DataFrame.
categorical_cols = ["Country", "Industry"]
encoder = OneHotEncoder()
encoded_features = encoder.fit_transform(df[categorical_cols]).toarray()
# Reuse df's index: a column-wise pd.concat with df aligns on index labels, so
# a fresh RangeIndex here would misalign rows (and introduce NaNs) as soon as
# df is filtered or re-indexed. With df's default index this is a no-op.
encoded_df = pd.DataFrame(
    encoded_features,
    columns=encoder.get_feature_names_out(categorical_cols),
    index=df.index,
)

print("\n=== Encoded Categorical Features (First 5 Rows) ===")
print(encoded_df.head())


=== Encoded Categorical Features (First 5 Rows) ===
   Country_Australia  Country_Canada  Country_China  Country_France  \
0                0.0             0.0            0.0             0.0   
1                0.0             0.0            1.0             0.0   
2                0.0             0.0            0.0             0.0   
3                0.0             0.0            0.0             1.0   
4                0.0             0.0            0.0             1.0   

   Country_Germany  Country_India  Country_Japan  Country_South Korea  \
0              0.0            0.0            0.0                  1.0   
1              0.0            0.0            0.0                  0.0   
2              0.0            0.0            0.0                  0.0   
3              0.0            0.0            0.0                  0.0   
4              0.0            0.0            0.0                  0.0   

   Country_UK  Country_USA  Industry_Automotive  Industry_Education  \
0         

In [18]:
# Combine encoded features with numerical features
# X = one-hot columns followed by the two numeric predictors; y = revenue increase.
numeric_cols = ["AI Adoption Rate (%)", "Job Loss Due to AI (%)"]
feature_parts = [encoded_df, df[numeric_cols]]
X = pd.concat(feature_parts, axis=1)
y = df["Revenue Increase Due to AI (%)"]

# Preview the first rows of the feature matrix and of the target.
print("\n=== Final Features (X) for Model Training (First 5 Rows) ===", X.head(), sep="\n")

print("\n=== Target Variable (y) ===", y.head(), sep="\n")


=== Final Features (X) for Model Training (First 5 Rows) ===
   Country_Australia  Country_Canada  Country_China  Country_France  \
0                0.0             0.0            0.0             0.0   
1                0.0             0.0            1.0             0.0   
2                0.0             0.0            0.0             0.0   
3                0.0             0.0            0.0             1.0   
4                0.0             0.0            0.0             1.0   

   Country_Germany  Country_India  Country_Japan  Country_South Korea  \
0              0.0            0.0            0.0                  1.0   
1              0.0            0.0            0.0                  0.0   
2              0.0            0.0            0.0                  0.0   
3              0.0            0.0            0.0                  0.0   
4              0.0            0.0            0.0                  0.0   

   Country_UK  Country_USA  ...  Industry_Finance  Industry_Gaming  \
0 

In [19]:
# Split data
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print("\n=== Training Data (X_train) ===", X_train.head(), sep="\n")

print("\n=== Testing Data (X_test) ===", X_test.head(), sep="\n")

# Train model
# 100-tree random forest, seeded so repeated runs build the same trees.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)


=== Training Data (X_train) ===
     Country_Australia  Country_Canada  Country_China  Country_France  \
79                 0.0             0.0            0.0             0.0   
197                0.0             0.0            0.0             1.0   
38                 0.0             0.0            0.0             0.0   
24                 0.0             0.0            0.0             1.0   
122                0.0             0.0            1.0             0.0   

     Country_Germany  Country_India  Country_Japan  Country_South Korea  \
79               0.0            1.0            0.0                  0.0   
197              0.0            0.0            0.0                  0.0   
38               0.0            0.0            0.0                  0.0   
24               0.0            0.0            0.0                  0.0   
122              0.0            0.0            0.0                  0.0   

     Country_UK  Country_USA  ...  Industry_Finance  Industry_Gaming  \
79   

In [20]:
# Evaluate
# Predict on the held-out rows, compute both metrics once, then report them.
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print(
    "\n=== Model Evaluation ===",
    f"R² Score: {r2:.2f}",
    f"MSE: {mse:.2f}",
    sep="\n",
)


=== Model Evaluation ===
R² Score: -0.29
MSE: 767.79


In [21]:
# Feature Importance
# Pair every column of X with the importance the fitted forest assigned to it,
# then show the ten highest-ranked features.
importances = model.feature_importances_
feature_importance = pd.DataFrame(
    {
        "Feature": X.columns,
        "Importance": importances,
    }
)
top_features = feature_importance.sort_values("Importance", ascending=False).head(10)
print("\n=== Top 10 Feature Importances ===", top_features, sep="\n")


=== Top 10 Feature Importances ===
                   Feature  Importance
20    AI Adoption Rate (%)    0.310287
21  Job Loss Due to AI (%)    0.270761
18          Industry_Media    0.045697
15          Industry_Legal    0.032774
6            Country_Japan    0.027392
13         Industry_Gaming    0.027228
2            Country_China    0.024283
17      Industry_Marketing    0.024143
11      Industry_Education    0.024032
9              Country_USA    0.023240
