In [25]:
import pandas as pd
import numpy as np
from faker import Faker
import random

fake = Faker()
# Set random seed for reproducibility
np.random.seed(42)

# Number of rows (candidates)
n_rows = 50000

# Generate candidate IDs
candidate_ids = np.arange(1, n_rows + 1)

# Generate random names using Faker
candidate_names = [fake.name() for _ in range(n_rows)]

# Generate random scores for each subject
java_scores = np.random.randint(0, 101, n_rows)
dotnet_scores = np.random.randint(0, 101, n_rows)
data_engineering_scores = np.random.randint(0, 101, n_rows)

# Create DataFrame
df = pd.DataFrame({
    'candidate_id': candidate_ids,
    'candidate_name':candidate_names,
    'java_score': java_scores,
    'dotnet_score': dotnet_scores,
    'data_engineering_score': data_engineering_scores,
})

# Function to recommend subject based on the lowest score
def recommend_subject(row):
    subjects = {
        'Java': row['java_score'],
        '.NET': row['dotnet_score'],
        'Data Engineering': row['data_engineering_score']
    }
    # Recommend the subject with the lowest score
    return min(subjects, key=subjects.get)

# Apply recommendation function
df['recommended_subject'] = df.apply(recommend_subject, axis=1)

# Display the first few rows of the dataset
print(df.head())

# Save to CSV
df.to_csv('synthetic_data.csv', index=False)


   candidate_id  candidate_name  java_score  dotnet_score  \
0             1    Robert Burns          51            76   
1             2  Tyler Williams          92            20   
2             3  Michael Rhodes          14            21   
3             4  Melanie Thomas          71            33   
4             5   Austin Wilson          60             9   

   data_engineering_score recommended_subject  
0                      95                Java  
1                      21                .NET  
2                      88                Java  
3                      10    Data Engineering  
4                      74                .NET  


In [86]:
df = pd.read_csv("synthetic_data.csv")
df.head()

Unnamed: 0,candidate_id,candidate_name,java_score,dotnet_score,data_engineering_score,recommended_subject
0,1,Robert Burns,51,76,95,Java
1,2,Tyler Williams,92,20,21,.NET
2,3,Michael Rhodes,14,21,88,Java
3,4,Melanie Thomas,71,33,10,Data Engineering
4,5,Austin Wilson,60,9,74,.NET


In [87]:
df.shape

(50000, 6)

In [88]:
df.isna().sum()

candidate_id              0
candidate_name            0
java_score                0
dotnet_score              0
data_engineering_score    0
recommended_subject       0
dtype: int64

In [89]:
df['recommended_subject'].value_counts()

recommended_subject
Java                16922
.NET                16573
Data Engineering    16505
Name: count, dtype: int64

In [90]:
from sklearn.preprocessing import LabelEncoder

In [91]:
encoder = LabelEncoder()
df_recommened=encoder.fit_transform(df.recommended_subject)

In [92]:
df_recommened = pd.DataFrame(df_recommened, columns=['new_recommended_subject'])

In [93]:
df_recommened.head()

Unnamed: 0,new_recommended_subject
0,2
1,0
2,2
3,1
4,0


In [94]:
df = pd.concat([df, df_recommened], axis=1)

In [95]:
df.head()

Unnamed: 0,candidate_id,candidate_name,java_score,dotnet_score,data_engineering_score,recommended_subject,new_recommended_subject
0,1,Robert Burns,51,76,95,Java,2
1,2,Tyler Williams,92,20,21,.NET,0
2,3,Michael Rhodes,14,21,88,Java,2
3,4,Melanie Thomas,71,33,10,Data Engineering,1
4,5,Austin Wilson,60,9,74,.NET,0


In [96]:
X = df[["java_score","dotnet_score","data_engineering_score"]]
y = df.new_recommended_subject

In [97]:
X.shape

(50000, 3)

In [98]:
y.shape

(50000,)

In [63]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=42)

In [64]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
model.fit(X_train, y_train)

In [65]:
y_pred = model.predict(X_test)

In [66]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [67]:
print(confusion_matrix(y_pred, y_test))
print(accuracy_score(y_pred, y_test))

[[4886    8    5]
 [   9 5026   13]
 [  11    8 5034]]
0.9964


In [78]:
model.predict([[70,12,34]])



array([0])

In [80]:
import pickle as pkl
with open('random_forest.pkl','wb') as f:
    pkl.dump(model,f)

In [108]:
from tensorflow.keras.models import Sequential
import tensorflow as tf
from tensorflow.keras.layers import Flatten, Dense, BatchNormalization, Dropout
models = Sequential()
models.add(Flatten(input_shape=[3,]))
models.add(Dense(units=64, activation='relu', kernel_regularizer= tf.keras.regularizers.l1(1e-3)))
models.add(Dropout(0.2))
models.add(BatchNormalization())
models.add(Dense(units=32, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(1e-4)))
models.add(Dropout(0.1))
models.add(BatchNormalization())
models.add(Dense(units=16, activation='relu', kernel_regularizer=tf.keras.regularizers.l1_l2(1e-3, 1e-5)))
models.add(BatchNormalization())
models.add(Dense(units=3, activation='softmax'))
models.summary()

  super().__init__(**kwargs)


In [109]:
models.compile(optimizer='adam', loss="sparse_categorical_crossentropy",metrics=["accuracy"])
hist = models.fit(X_train,y_train, validation_data=(X_test, y_test), epochs=15, batch_size = 63)

Epoch 1/15
[1m556/556[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.8179 - loss: 0.5571 - val_accuracy: 0.9707 - val_loss: 0.1913
Epoch 2/15
[1m556/556[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9421 - loss: 0.2389 - val_accuracy: 0.9819 - val_loss: 0.1378
Epoch 3/15
[1m556/556[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9480 - loss: 0.2016 - val_accuracy: 0.9783 - val_loss: 0.1224
Epoch 4/15
[1m556/556[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9544 - loss: 0.1724 - val_accuracy: 0.9791 - val_loss: 0.1069
Epoch 5/15
[1m556/556[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9620 - loss: 0.1493 - val_accuracy: 0.9085 - val_loss: 0.3607
Epoch 6/15
[1m556/556[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9636 - loss: 0.1365 - val_accuracy: 0.9917 - val_loss: 0.0733
Epoch 7/15
[1m556/556[0m 

In [111]:
y_pred =models.predict(X_test)

[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step


In [1]:
##text analysis

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [4]:
df = pd.read_excel("ReadyToTrain_data_2col_with_subjectivity_final.xlsx")

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,StudentComments,Rating,totalwords,Sentiment,sent_pretrained,subjectivity,subj-score,isSame
0,0,good,4.96,1,positive,positive,subjective,0.6,True
1,1,good,5.0,1,positive,positive,subjective,0.6,True
2,2,teacher,4.25,1,positive,neutral,objective,0.0,fake
3,3,friendly teacher but not enough ability to enc...,4.38,10,positive,neutral,subjective,0.5,fake
4,4,teacher,4.92,1,positive,neutral,objective,0.0,fake


In [6]:
df.drop("Unnamed: 0", axis =1, inplace = True)

In [7]:
df.head()

Unnamed: 0,StudentComments,Rating,totalwords,Sentiment,sent_pretrained,subjectivity,subj-score,isSame
0,good,4.96,1,positive,positive,subjective,0.6,True
1,good,5.0,1,positive,positive,subjective,0.6,True
2,teacher,4.25,1,positive,neutral,objective,0.0,fake
3,friendly teacher but not enough ability to enc...,4.38,10,positive,neutral,subjective,0.5,fake
4,teacher,4.92,1,positive,neutral,objective,0.0,fake


In [11]:
df.shape

(1048575, 8)

In [12]:
df.columns

Index(['StudentComments', 'Rating', 'totalwords', 'Sentiment',
       'sent_pretrained', 'subjectivity', 'subj-score', 'isSame'],
      dtype='object')