In [18]:
import pandas as pd
import re
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the CSV file
# NOTE(review): this is an absolute, machine-specific Windows path; the script
# only runs where this exact file exists, so adjust it when running elsewhere.
file_path = "C:\\Users\\ASUS\\Desktop\\internship.csv"
# The loaded frame is bound to the module-level name `data`, which the rest of
# the script reads and modifies in place.
data = pd.read_csv(file_path)

# Improved stipend cleaning function
def clean_stipend(stipend):
    """Convert a raw stipend cell into a number.

    Strings have the rupee sign, the '/month' suffix and thousands separators
    removed; the first run of digits that remains is returned as an int (for
    a range such as '5,000-10,000' that is the lower bound).

    Args:
        stipend: Raw cell value; typically a string, but may already be
            numeric or missing.

    Returns:
        The parsed int for strings containing digits; 0 for strings without
        any digit (e.g. "Unpaid") and for missing values (None/NaN); any
        other non-string value unchanged.
    """
    if isinstance(stipend, str):
        cleaned = stipend.replace('₹', '').replace('/month', '').replace(',', '').strip()
        match = re.search(r'\d+', cleaned)
        return int(match.group(0)) if match else 0  # no digits: e.g. "Unpaid"
    # Missing cells would otherwise pass through and leave NaN/None in a
    # column that is later compared against 0. NaN is the only float that is
    # not equal to itself.
    if stipend is None or (isinstance(stipend, float) and stipend != stipend):
        return 0
    return stipend

# Replace the raw 'stipend' column with the value clean_stipend returns for
# each row.
data['stipend'] = data['stipend'].apply(clean_stipend)

# Ensure 'duration' column is properly cleaned
def clean_duration(duration):
    """Convert a raw duration cell into a number.

    Only the first run of digits in a string is used; the unit is not
    interpreted, so '6 Months' and '6 Weeks' both give 6.

    Args:
        duration: Raw cell value; typically a string, but may already be
            numeric or missing.

    Returns:
        The parsed int for strings containing digits; 0 for strings without
        any digit and for missing values (None/NaN); any other non-string
        value unchanged.
    """
    if isinstance(duration, str):
        match = re.search(r'\d+', duration)
        return int(match.group(0)) if match else 0
    # Missing cells would otherwise pass through as NaN/None into a column
    # that is used as a numeric model feature. NaN is the only float that is
    # not equal to itself.
    if duration is None or (isinstance(duration, float) and duration != duration):
        return 0
    return duration

data['duration'] = data['duration'].apply(clean_duration)

# Display basic info and check for missing values.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted a stray "None" line.
data.info()

# Display summary statistics
print(data.describe(include='all'))

# Feature Engineering
# Start dates that cannot be parsed become NaT and are then replaced by the
# sentinel 1900-01-01, so every such row ends up with start_month == 1.
# NOTE(review): if most raw values are not real dates, start_month is nearly
# constant and carries no signal -- confirm the raw 'start_date' format.
data['start_date'] = pd.to_datetime(data['start_date'], errors='coerce').fillna(pd.Timestamp('1900-01-01'))
data['start_month'] = data['start_date'].dt.month

# Define target variable: 1 when the cleaned stipend is positive, else 0
data['paid'] = (data['stipend'] > 0).astype('int64')

# Select features for the model
features = ['internship_title', 'company_name', 'location', 'duration', 'start_month']
X = data[features]
y = data['paid']

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Work on explicit copies: encoded columns are assigned below, and writing
# into the frames returned by the split can raise SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Encode categorical variables using only the training data
label_encoders = {}
for column in ['internship_title', 'company_name', 'location']:
    le = LabelEncoder()
    X_train[column] = le.fit_transform(X_train[column])
    # A label's code is its position in le.classes_, so a dict lookup gives
    # the same codes as le.transform() without one encoder call per row.
    # Labels that never occurred in the training split are encoded as -1.
    code_by_label = {label: code for code, label in enumerate(le.classes_)}
    X_test[column] = X_test[column].map(code_by_label).fillna(-1).astype('int64')
    label_encoders[column] = le

# Train a Random Forest Classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate the Model on the held-out split
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:")
print(report)
print("Confusion Matrix:")
print(conf_matrix)

# Feature Importance Analysis: one row per feature, most important first
feature_importances = pd.DataFrame(model.feature_importances_, index=features, columns=['importance']).sort_values('importance', ascending=False)
print("\nFeature Importances:")
print(feature_importances)

# Generate Random Recommendations/Solutions
def generate_random_recommendations(num_recommendations=5):
    """Build random feature combinations and label each with the model's verdict.

    For every recommendation one value per feature is drawn independently:
    the encoded categorical values from the training split, duration and
    start month from the full dataset. Because the columns are sampled
    separately, a combination need not correspond to any real row.

    Args:
        num_recommendations: Number of combinations to generate.

    Returns:
        A list of dicts holding the decoded title, company and location, the
        sampled duration and start month, and 'predicted_paid' set to 'Paid'
        or 'Unpaid'.
    """
    encoded_columns = ('internship_title', 'company_name', 'location')
    numeric_columns = ('duration', 'start_month')

    results = []
    for _ in range(num_recommendations):
        # Draw order (title, company, location, duration, month) also fixes
        # the column order of the one-row frame given to the model.
        sampled = {}
        for name in encoded_columns:
            sampled[name] = np.random.choice(X_train[name])
        for name in numeric_columns:
            sampled[name] = np.random.choice(data[name])

        predicted_label = model.predict(pd.DataFrame([sampled]))[0]

        # Turn the integer codes back into the original text labels.
        entry = {}
        for name in encoded_columns:
            entry[name] = label_encoders[name].inverse_transform([sampled[name]])[0]
        for name in numeric_columns:
            entry[name] = sampled[name]
        if predicted_label == 1:
            entry['predicted_paid'] = 'Paid'
        else:
            entry['predicted_paid'] = 'Unpaid'
        results.append(entry)
    return results

# Generate and display random recommendations
# Uses the function's default count (5 recommendations).
random_recommendations = generate_random_recommendations()
print("\nRandom Recommendations/Solutions to Hindrances in Obtaining Internships:")
# Each recommendation is a dict and is printed as-is, one per line.
for rec in random_recommendations:
    print(rec)

# Generate Diverse Actionable Solutions
def generate_diverse_solutions(num_solutions=5):
    """Return a random selection of suggestion sentences.

    Some templates are filled from the data (one of the five most frequent
    titles, companies, locations or start months, or the mean duration); the
    rest are fixed sentences. A template is only evaluated if it is selected.

    Args:
        num_solutions: Number of distinct suggestions wanted. Capped at the
            number of available templates instead of raising.

    Returns:
        A list of suggestion strings, without repeated templates.
    """
    def top_label(column):
        """Decode one of the five most frequent training codes of *column*."""
        # Converted to a plain list: random.choice expects a sequence, and
        # some Python 3.11 releases truth-test the argument, which raises for
        # a pandas Index.
        top_codes = X_train[column].value_counts().index[:5].tolist()
        return label_encoders[column].inverse_transform([random.choice(top_codes)])[0]

    def top_start_month():
        """Pick one of the five most frequent start months in the dataset."""
        return random.choice(data['start_month'].value_counts().index[:5].tolist())

    possible_actions = [
        lambda: f"Focus on offering more internships with the title '{top_label('internship_title')}'.",
        lambda: f"Encourage companies like '{top_label('company_name')}' to provide more internships.",
        lambda: f"Increase internship opportunities in locations like '{top_label('location')}'.",
        lambda: f"Consider offering internships with an average duration of {data['duration'].mean():.0f} months.",
        lambda: f"Promote internships starting in month {top_start_month()}.",
        lambda: "Introduce flexible internship durations to attract more candidates.",
        lambda: "Increase the stipend for internships in less popular locations to attract more applications.",
        lambda: "Provide remote work options to increase the pool of applicants.",
        lambda: "Partner with universities to offer credit for internships.",
    ]

    # random.sample raises ValueError when asked for more items than exist.
    count = min(num_solutions, len(possible_actions))
    return [action() for action in random.sample(possible_actions, count)]

# Uses the function's default count (5 suggestions); the selection is random,
# so the printed sentences differ between runs.
solutions = generate_diverse_solutions()
print("\nDiverse Solutions to Increase Internship Opportunities:")
for solution in solutions:
    print(solution)

# Predict Chances of Getting an Internship Based on User Input
def predict_internship_chances(internship_title, company_name, location, duration, start_month):
    """Return the model's probability that the described internship is paid.

    Despite the name, the value is not a chance of being hired: it is the
    classifier's predicted probability for the 'paid' class (label 1).

    Args:
        internship_title: Title text as it appears in the dataset.
        company_name: Company text as it appears in the dataset.
        location: Location text as it appears in the dataset.
        duration: Numeric duration, in the same form as the cleaned column.
        start_month: Start month number.

    Returns:
        The probability of class 1, or 0.0 if the model never saw class 1.

    A text value the fitted encoder does not know is replaced by the most
    frequent known value of that column, and a warning line is printed.
    """
    def encode(column, value):
        """Encode *value* with the column's fitted encoder, with a fallback."""
        encoder = label_encoders[column]
        if value in encoder.classes_:
            return encoder.transform([value])[0]
        # The fallback must itself be a label the encoder was fitted on,
        # otherwise transform() raises. The encoders only know the training
        # split, so take the mode among rows whose label the encoder knows
        # rather than the mode of the whole column.
        known_values = data.loc[data[column].isin(encoder.classes_), column]
        default_value = known_values.mode()[0]
        print(f"Warning: '{value}' not seen before. Using default '{default_value}' instead.")
        return encoder.transform([default_value])[0]

    # Key order matches the feature order the model was trained with.
    user_input = pd.DataFrame([{
        'internship_title': encode('internship_title', internship_title),
        'company_name': encode('company_name', company_name),
        'location': encode('location', location),
        'duration': duration,
        'start_month': start_month
    }])

    # predict_proba columns follow model.classes_, so locate label 1 there
    # instead of assuming it sits in the second column.
    probabilities = model.predict_proba(user_input)[0]
    class_labels = list(model.classes_)
    if 1 not in class_labels:
        return 0.0
    return probabilities[class_labels.index(1)]

# Example user input
user_internship_title = 'Web Development Intern'
user_company_name = 'TCS'
user_location = 'Mumbai'
user_duration = 3
user_start_month = 1

# Predict and display the result for the example input.
# The returned value is the model's probability that such a listing is paid
# (it comes from predict_proba), not a probability of being selected for it.
chances = predict_internship_chances(user_internship_title, user_company_name, user_location, user_duration, user_start_month)
print(f"\nChances of Getting a Paid Internship: {chances * 100:.2f}%")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6485 entries, 0 to 6484
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   internship_title  6485 non-null   object
 1   company_name      6485 non-null   object
 2   location          6485 non-null   object
 3   start_date        6485 non-null   object
 4   duration          6485 non-null   int64 
 5   stipend           6485 non-null   int64 
dtypes: int64(2), object(4)
memory usage: 304.1+ KB
None
                    internship_title       company_name        location  \
count                           6485               6485            6485   
unique                          2162               4656             173   
top     Business Development (Sales)  Top Talent Bridge  Work From Home   
freq                             379                 51            2870   
mean                             NaN                NaN             NaN   
std                

  data['start_date'] = pd.to_datetime(data['start_date'], errors='coerce').fillna(pd.Timestamp('1900-01-01'))


Accuracy: 97.76%
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.20      0.33        35
           1       0.98      1.00      0.99      1262

    accuracy                           0.98      1297
   macro avg       0.93      0.60      0.66      1297
weighted avg       0.98      0.98      0.97      1297

Confusion Matrix:
[[   7   28]
 [   1 1261]]

Feature Importances:
                  importance
company_name        0.498898
internship_title    0.405032
location            0.054431
duration            0.041639
start_month         0.000000

Random Recommendations/Solutions to Hindrances in Obtaining Internships:
{'internship_title': 'Online Reputation Management', 'company_name': 'Avaari', 'location': 'Work From Home', 'duration': 6, 'start_month': 1, 'predicted_paid': 'Paid'}
{'internship_title': 'Telecalling', 'company_name': 'IQnext', 'location': 'Bangalore', 'duration': 6, 'start_month': 1, 'predicted_paid': 'Paid'}
{'in

In [18]:
import pandas as pd
import re
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the CSV file
# NOTE(review): this is an absolute, machine-specific Windows path; the script
# only runs where this exact file exists, so adjust it when running elsewhere.
file_path = "C:\\Users\\ASUS\\Desktop\\internship.csv"
# The loaded frame is bound to the module-level name `data`, which the rest of
# the script reads and modifies in place.
data = pd.read_csv(file_path)

# Improved stipend cleaning function
def clean_stipend(stipend):
    """Convert a raw stipend cell into a number.

    Strings have the rupee sign, the '/month' suffix and thousands separators
    removed; the first run of digits that remains is returned as an int (for
    a range such as '5,000-10,000' that is the lower bound).

    Args:
        stipend: Raw cell value; typically a string, but may already be
            numeric or missing.

    Returns:
        The parsed int for strings containing digits; 0 for strings without
        any digit (e.g. "Unpaid") and for missing values (None/NaN); any
        other non-string value unchanged.
    """
    if isinstance(stipend, str):
        cleaned = stipend.replace('₹', '').replace('/month', '').replace(',', '').strip()
        match = re.search(r'\d+', cleaned)
        return int(match.group(0)) if match else 0  # no digits: e.g. "Unpaid"
    # Missing cells would otherwise pass through and leave NaN/None in a
    # column that is later compared against 0. NaN is the only float that is
    # not equal to itself.
    if stipend is None or (isinstance(stipend, float) and stipend != stipend):
        return 0
    return stipend

# Replace the raw 'stipend' column with the value clean_stipend returns for
# each row.
data['stipend'] = data['stipend'].apply(clean_stipend)

# Ensure 'duration' column is properly cleaned
def clean_duration(duration):
    """Convert a raw duration cell into a number.

    Only the first run of digits in a string is used; the unit is not
    interpreted, so '6 Months' and '6 Weeks' both give 6.

    Args:
        duration: Raw cell value; typically a string, but may already be
            numeric or missing.

    Returns:
        The parsed int for strings containing digits; 0 for strings without
        any digit and for missing values (None/NaN); any other non-string
        value unchanged.
    """
    if isinstance(duration, str):
        match = re.search(r'\d+', duration)
        return int(match.group(0)) if match else 0
    # Missing cells would otherwise pass through as NaN/None into a column
    # that is used as a numeric model feature. NaN is the only float that is
    # not equal to itself.
    if duration is None or (isinstance(duration, float) and duration != duration):
        return 0
    return duration

data['duration'] = data['duration'].apply(clean_duration)

# Display basic info and check for missing values.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted a stray "None" line.
data.info()

# Display summary statistics
print(data.describe(include='all'))

# Feature Engineering
# Start dates that cannot be parsed become NaT and are then replaced by the
# sentinel 1900-01-01, so every such row ends up with start_month == 1.
# NOTE(review): if most raw values are not real dates, start_month is nearly
# constant and carries no signal -- confirm the raw 'start_date' format.
data['start_date'] = pd.to_datetime(data['start_date'], errors='coerce').fillna(pd.Timestamp('1900-01-01'))
data['start_month'] = data['start_date'].dt.month

# Define target variable: 1 when the cleaned stipend is positive, else 0
data['paid'] = (data['stipend'] > 0).astype('int64')

# Select features for the model
features = ['internship_title', 'company_name', 'location', 'duration', 'start_month']
X = data[features]
y = data['paid']

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Work on explicit copies: encoded columns are assigned below, and writing
# into the frames returned by the split can raise SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Encode categorical variables using only the training data
label_encoders = {}
for column in ['internship_title', 'company_name', 'location']:
    le = LabelEncoder()
    X_train[column] = le.fit_transform(X_train[column])
    # A label's code is its position in le.classes_, so a dict lookup gives
    # the same codes as le.transform() without one encoder call per row.
    # Labels that never occurred in the training split are encoded as -1.
    code_by_label = {label: code for code, label in enumerate(le.classes_)}
    X_test[column] = X_test[column].map(code_by_label).fillna(-1).astype('int64')
    label_encoders[column] = le

# Train a Random Forest Classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate the Model on the held-out split
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:")
print(report)
print("Confusion Matrix:")
print(conf_matrix)

# Feature Importance Analysis: one row per feature, most important first
feature_importances = pd.DataFrame(model.feature_importances_, index=features, columns=['importance']).sort_values('importance', ascending=False)
print("\nFeature Importances:")
print(feature_importances)

# Generate Random Recommendations/Solutions
def generate_random_recommendations(num_recommendations=5):
    """Build random feature combinations and label each with the model's verdict.

    For every recommendation one value per feature is drawn independently:
    the encoded categorical values from the training split, duration and
    start month from the full dataset. Because the columns are sampled
    separately, a combination need not correspond to any real row.

    Args:
        num_recommendations: Number of combinations to generate.

    Returns:
        A list of dicts holding the decoded title, company and location, the
        sampled duration and start month, and 'predicted_paid' set to 'Paid'
        or 'Unpaid'.
    """
    encoded_columns = ('internship_title', 'company_name', 'location')
    numeric_columns = ('duration', 'start_month')

    results = []
    for _ in range(num_recommendations):
        # Draw order (title, company, location, duration, month) also fixes
        # the column order of the one-row frame given to the model.
        sampled = {}
        for name in encoded_columns:
            sampled[name] = np.random.choice(X_train[name])
        for name in numeric_columns:
            sampled[name] = np.random.choice(data[name])

        predicted_label = model.predict(pd.DataFrame([sampled]))[0]

        # Turn the integer codes back into the original text labels.
        entry = {}
        for name in encoded_columns:
            entry[name] = label_encoders[name].inverse_transform([sampled[name]])[0]
        for name in numeric_columns:
            entry[name] = sampled[name]
        if predicted_label == 1:
            entry['predicted_paid'] = 'Paid'
        else:
            entry['predicted_paid'] = 'Unpaid'
        results.append(entry)
    return results

# Generate and display random recommendations
# Uses the function's default count (5 recommendations).
random_recommendations = generate_random_recommendations()
print("\nRandom Recommendations/Solutions to Hindrances in Obtaining Internships:")
# Each recommendation is a dict and is printed as-is, one per line.
for rec in random_recommendations:
    print(rec)

# Generate Diverse Actionable Solutions
def generate_diverse_solutions(num_solutions=5):
    """Return a random selection of suggestion sentences.

    Some templates are filled from the data (one of the five most frequent
    titles, companies, locations or start months, or the mean duration); the
    rest are fixed sentences. A template is only evaluated if it is selected.

    Args:
        num_solutions: Number of distinct suggestions wanted. Capped at the
            number of available templates instead of raising.

    Returns:
        A list of suggestion strings, without repeated templates.
    """
    def top_label(column):
        """Decode one of the five most frequent training codes of *column*."""
        # Converted to a plain list: random.choice expects a sequence, and
        # some Python 3.11 releases truth-test the argument, which raises for
        # a pandas Index.
        top_codes = X_train[column].value_counts().index[:5].tolist()
        return label_encoders[column].inverse_transform([random.choice(top_codes)])[0]

    def top_start_month():
        """Pick one of the five most frequent start months in the dataset."""
        return random.choice(data['start_month'].value_counts().index[:5].tolist())

    possible_actions = [
        lambda: f"Focus on offering more internships with the title '{top_label('internship_title')}'.",
        lambda: f"Encourage companies like '{top_label('company_name')}' to provide more internships.",
        lambda: f"Increase internship opportunities in locations like '{top_label('location')}'.",
        lambda: f"Consider offering internships with an average duration of {data['duration'].mean():.0f} months.",
        lambda: f"Promote internships starting in month {top_start_month()}.",
        lambda: "Introduce flexible internship durations to attract more candidates.",
        lambda: "Increase the stipend for internships in less popular locations to attract more applications.",
        lambda: "Provide remote work options to increase the pool of applicants.",
        lambda: "Partner with universities to offer credit for internships.",
    ]

    # random.sample raises ValueError when asked for more items than exist.
    count = min(num_solutions, len(possible_actions))
    return [action() for action in random.sample(possible_actions, count)]

# Uses the function's default count (5 suggestions); the selection is random,
# so the printed sentences differ between runs.
solutions = generate_diverse_solutions()
print("\nDiverse Solutions to Increase Internship Opportunities:")
for solution in solutions:
    print(solution)

# Predict Chances of Getting an Internship Based on User Input
def predict_internship_chances(internship_title, company_name, location, duration, start_month):
    """Return the model's probability that the described internship is paid.

    Despite the name, the value is not a chance of being hired: it is the
    classifier's predicted probability for the 'paid' class (label 1).

    Args:
        internship_title: Title text as it appears in the dataset.
        company_name: Company text as it appears in the dataset.
        location: Location text as it appears in the dataset.
        duration: Numeric duration, in the same form as the cleaned column.
        start_month: Start month number.

    Returns:
        The probability of class 1, or 0.0 if the model never saw class 1.

    A text value the fitted encoder does not know is replaced by the most
    frequent known value of that column, and a warning line is printed.
    """
    def encode(column, value):
        """Encode *value* with the column's fitted encoder, with a fallback."""
        encoder = label_encoders[column]
        if value in encoder.classes_:
            return encoder.transform([value])[0]
        # The fallback must itself be a label the encoder was fitted on,
        # otherwise transform() raises. The encoders only know the training
        # split, so take the mode among rows whose label the encoder knows
        # rather than the mode of the whole column.
        known_values = data.loc[data[column].isin(encoder.classes_), column]
        default_value = known_values.mode()[0]
        print(f"Warning: '{value}' not seen before. Using default '{default_value}' instead.")
        return encoder.transform([default_value])[0]

    # Key order matches the feature order the model was trained with.
    user_input = pd.DataFrame([{
        'internship_title': encode('internship_title', internship_title),
        'company_name': encode('company_name', company_name),
        'location': encode('location', location),
        'duration': duration,
        'start_month': start_month
    }])

    # predict_proba columns follow model.classes_, so locate label 1 there
    # instead of assuming it sits in the second column.
    probabilities = model.predict_proba(user_input)[0]
    class_labels = list(model.classes_)
    if 1 not in class_labels:
        return 0.0
    return probabilities[class_labels.index(1)]

# Example user input
user_internship_title = 'Web Development Intern'
user_company_name = 'TCS'
user_location = 'Mumbai'
user_duration = 3
user_start_month = 1

# Predict and display the result for the example input.
# The returned value is the model's probability that such a listing is paid
# (it comes from predict_proba), not a probability of being selected for it.
chances = predict_internship_chances(user_internship_title, user_company_name, user_location, user_duration, user_start_month)
print(f"\nChances of Getting a Paid Internship: {chances * 100:.2f}%")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6485 entries, 0 to 6484
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   internship_title  6485 non-null   object
 1   company_name      6485 non-null   object
 2   location          6485 non-null   object
 3   start_date        6485 non-null   object
 4   duration          6485 non-null   int64 
 5   stipend           6485 non-null   int64 
dtypes: int64(2), object(4)
memory usage: 304.1+ KB
None
                    internship_title       company_name        location  \
count                           6485               6485            6485   
unique                          2162               4656             173   
top     Business Development (Sales)  Top Talent Bridge  Work From Home   
freq                             379                 51            2870   
mean                             NaN                NaN             NaN   
std                

  data['start_date'] = pd.to_datetime(data['start_date'], errors='coerce').fillna(pd.Timestamp('1900-01-01'))


Accuracy: 97.76%
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.20      0.33        35
           1       0.98      1.00      0.99      1262

    accuracy                           0.98      1297
   macro avg       0.93      0.60      0.66      1297
weighted avg       0.98      0.98      0.97      1297

Confusion Matrix:
[[   7   28]
 [   1 1261]]

Feature Importances:
                  importance
company_name        0.498898
internship_title    0.405032
location            0.054431
duration            0.041639
start_month         0.000000

Random Recommendations/Solutions to Hindrances in Obtaining Internships:
{'internship_title': 'Online Reputation Management', 'company_name': 'Avaari', 'location': 'Work From Home', 'duration': 6, 'start_month': 1, 'predicted_paid': 'Paid'}
{'internship_title': 'Telecalling', 'company_name': 'IQnext', 'location': 'Bangalore', 'duration': 6, 'start_month': 1, 'predicted_paid': 'Paid'}
{'in