In [None]:
import pandas as pd

In [None]:
# Read the admissions dataset; the path is relative, so the CSV must sit in
# the notebook's working directory.
file_path = 'Admission_Predict.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# Text preview of the leading rows (head() defaults to five rows).
print(data.head(n=5))

In [None]:
# Print, in order: (rows, columns) shape, the column labels, and the dtype of
# each column.
for dataset_summary in (data.shape, data.columns, data.dtypes):
    print(dataset_summary)


In [None]:
# Per-column count of missing entries (isna is the same check as isnull).
missing_per_column = data.isna().sum()
print(missing_per_column)


In [None]:
# Descriptive statistics (count, mean, std, quartiles, ...) of the numeric columns.
summary_statistics = data.describe()
print(summary_statistics)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# One histogram panel per selected numeric feature, 20 bins each.
numerical_features = ['GRE Score', 'TOEFL Score', 'CGPA']
data.hist(column=numerical_features, bins=20, figsize=(12, 6))
plt.show()


In [None]:

# Leave 'Serial No.' out of the correlation analysis.
data_without_serial = data.drop('Serial No.', axis=1)

# Annotated heatmap of the pairwise correlation coefficients; the diverging
# colour map is centred on zero so the sign of a correlation is visible.
correlation_matrix = data_without_serial.corr()
plt.figure(figsize=(10, 8))
heatmap_ax = sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
heatmap_ax.set_title('Correlation Factors Heat Map', color='black', size=20)
plt.show()


In [None]:
# Scatter plot of GRE Score (x) against Chance of Admit (y).
# The target column label carries a trailing space: 'Chance of Admit '.
plt.figure(figsize=(8, 4))
gre_scatter_ax = sns.scatterplot(data=data, x='GRE Score', y='Chance of Admit ')
gre_scatter_ax.set_title('GRE Score vs. Chance of Admit', fontsize=16)
gre_scatter_ax.set_xlabel('GRE Score', fontsize=14)
gre_scatter_ax.set_ylabel('Chance of Admit', fontsize=14)
plt.show()

# Written interpretation of the scatter plot, emitted as cell output.
gre_scatter_description = "The scatter plot illustrates the relationship between applicants' GRE Scores and their corresponding Chance of Admit to a graduate program. The plot reveals a distinctive trend where, on average, higher GRE Scores are associated with a greater Chance of Admit. This trend is highlighted by the upward-sloping linear pattern that emerges as you move from left to right along the x-axis."
print(gre_scatter_description)

# Histogram of GRE Score (20 bins) with a kernel-density curve overlaid.
gre_scores = data['GRE Score']
plt.figure(figsize=(8, 4))
gre_hist_ax = sns.histplot(gre_scores, bins=20, kde=True)
gre_hist_ax.set_title('Distribution of GRE Score', fontsize=16)
gre_hist_ax.set_xlabel('GRE Score', fontsize=14)
gre_hist_ax.set_ylabel('Frequency', fontsize=14)
plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Probability-density view of GRE Score.
# sns.distplot is deprecated in seaborn (it warns on every call and is slated
# for removal), so the same kind of figure is drawn with histplot instead:
# stat='density' normalises the bars to a density, and kde=True overlays the
# smoothed density curve.
plt.figure(figsize=(6, 4))  # Adjust the figsize values as needed
gre_density_ax = sns.histplot(data['GRE Score'], kde=True, stat='density')
gre_density_ax.set_title('Probability Distribution for GRE Test Scores', size=20)
plt.xlabel('GRE Score', size=14)
plt.ylabel('Probability Density', size=14)
plt.show()

# Comment about the distribution
print("The probability distribution plot showcases the distribution of GRE test scores. The plot suggests that the GRE scores are somewhat normally distributed, with a peak around the center of the scores. This indicates that a significant number of applicants have scores around the mean, and the frequency decreases as scores deviate from the mean in both directions.")


In [None]:
# Correlation of every column with GRE Score, as a one-column table.
# The self-correlation row and the 'Serial No.' row are removed, and the
# column is relabelled to say what the numbers are.
GRE_CORR = (
    data.corr()[['GRE Score']]
    .drop(index=['GRE Score', 'Serial No.'])
    .rename(columns={'GRE Score': 'GRE Correlation Coeff'})
)
GRE_CORR


In [None]:
# Written interpretation of the GRE Score / Chance of Admit correlation.
# NOTE(review): 0.803 is hard-coded, presumably copied from the GRE_CORR table
# computed earlier in the notebook; it will not update if the data changes.
gre_correlation_note = "Chance of Admit: 0.803 - A strong positive correlation indicates that higher GRE scores are strongly related to a higher chance of admission."
print(gre_correlation_note)

In [None]:
# Scatter plot of TOEFL Score (x) against Chance of Admit (y).
# The target column label carries a trailing space: 'Chance of Admit '.
plt.figure(figsize=(8, 4))
toefl_scatter_ax = sns.scatterplot(data=data, x='TOEFL Score', y='Chance of Admit ')
toefl_scatter_ax.set_title('TOEFL Score vs. Chance of Admit', fontsize=16)
toefl_scatter_ax.set_xlabel('TOEFL Score', fontsize=14)
toefl_scatter_ax.set_ylabel('Chance of Admit', fontsize=14)
plt.show()

# Written interpretation of the scatter plot, emitted as cell output.
toefl_scatter_description = "The scatter plot illustrates the relationship between applicants' TOEFL Scores and their corresponding Chance of Admit to a graduate program. The plot suggests a positive correlation between higher TOEFL Scores and a higher likelihood of admission. While there's a general upward trend, there is also variability in the data points, indicating that other factors might also play a role in determining admission chances."
print(toefl_scatter_description)


In [None]:

# Probability-density view of TOEFL Score.
# sns.distplot is deprecated in seaborn (it warns on every call and is slated
# for removal), so the same kind of figure is drawn with histplot instead:
# stat='density' normalises the bars to a density, and kde=True overlays the
# smoothed density curve.
plt.figure(figsize=(6, 4))  # Adjust the figsize values as needed
toefl_density_ax = sns.histplot(data['TOEFL Score'], kde=True, stat='density')
toefl_density_ax.set_title('Probability Distribution for TOEFL Test Scores', size=20)
plt.xlabel('TOEFL Score', size=14)
plt.ylabel('Probability Density', size=14)
plt.show()

# Comment about the distribution
print("The probability distribution plot showcases the distribution of TOEFL test scores. The plot suggests that the TOEFL scores are somewhat normally distributed, with a peak around the center of the scores. This indicates that a significant number of applicants have scores around the mean, and the frequency decreases as scores deviate from the mean in both directions.")


In [None]:


# How many applicants fall into each University Rating.
plt.figure(figsize=(6, 4))
rating_count_ax = sns.countplot(data=data, x='University Rating')
rating_count_ax.set_title('Distribution of University Rating', fontsize=16)
rating_count_ax.set_xlabel('University Rating', fontsize=14)
rating_count_ax.set_ylabel('Frequency', fontsize=14)
plt.show()

# Mean Chance of Admit per University Rating (barplot aggregates with the
# mean by default and draws an error bar on each bar).
plt.figure(figsize=(8, 5))
rating_mean_ax = sns.barplot(data=data, x='University Rating', y='Chance of Admit ')
rating_mean_ax.set_title('Average Chance of Admit by University Rating', fontsize=16)
rating_mean_ax.set_xlabel('University Rating', fontsize=14)
rating_mean_ax.set_ylabel('Average Chance of Admit', fontsize=14)
plt.show()


In [None]:
# NOTE(review): the message below is an unfinished sentence ("most of the
# students" ... what?). It presumably was meant to summarise the University
# Rating plots in the preceding cell - TODO complete the observation or
# remove this cell.
print("most of the students")

In [None]:
# Count applicants for every (University Rating, Research) pair, then pivot
# the Research level into columns so each rating becomes one stacked bar.
rating_research_groups = data.groupby(['University Rating', 'Research'])['Research']
research_by_rating = rating_research_groups.count().unstack()

stacked_ax = research_by_rating.plot.bar(stacked=True, figsize=(8, 6))
stacked_ax.set_title('Research Distribution by University Rating', fontsize=16)
stacked_ax.set_xlabel('University Rating', fontsize=14)
stacked_ax.set_ylabel('Count', fontsize=14)
stacked_ax.legend(title='Research', labels=['No Research', 'Research'])
plt.show()


In [None]:

# How many applicants received each SOP rating.
plt.figure(figsize=(6, 4))
sop_count_ax = sns.countplot(data=data, x='SOP')
sop_count_ax.set_title('Distribution of Statement of Purpose (SOP) Ratings', fontsize=16)
sop_count_ax.set_xlabel('SOP Rating', fontsize=14)
sop_count_ax.set_ylabel('Frequency', fontsize=14)
plt.show()

# Spread of Chance of Admit within each SOP rating.
plt.figure(figsize=(8, 6))
sop_box_ax = sns.boxplot(data=data, x='SOP', y='Chance of Admit ')
sop_box_ax.set_title('Statement of Purpose (SOP) Rating vs. Chance of Admit', fontsize=16)
sop_box_ax.set_xlabel('SOP Rating', fontsize=14)
sop_box_ax.set_ylabel('Chance of Admit', fontsize=14)
plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# How many applicants received each LOR rating.
# The column label carries a trailing space: 'LOR '.
plt.figure(figsize=(6, 4))
lor_count_ax = sns.countplot(data=data, x='LOR ')
lor_count_ax.set_title('Distribution of Letter of Recommendation (LOR) Ratings', fontsize=16)
lor_count_ax.set_xlabel('LOR Rating', fontsize=14)
lor_count_ax.set_ylabel('Frequency', fontsize=14)
plt.show()

# Spread of Chance of Admit within each LOR rating.
plt.figure(figsize=(8, 6))
lor_box_ax = sns.boxplot(data=data, x='LOR ', y='Chance of Admit ')
lor_box_ax.set_title('Letter of Recommendation (LOR) Rating vs. Chance of Admit', fontsize=16)
lor_box_ax.set_xlabel('LOR Rating', fontsize=14)
lor_box_ax.set_ylabel('Chance of Admit', fontsize=14)
plt.show()


In [None]:

# Histogram of CGPA (10 bins) with a kernel-density curve overlaid.
cgpa_values = data['CGPA']
plt.figure(figsize=(6, 4))
cgpa_hist_ax = sns.histplot(cgpa_values, bins=10, kde=True)
cgpa_hist_ax.set_title('Distribution of CGPA Scores', fontsize=16)
cgpa_hist_ax.set_xlabel('CGPA', fontsize=14)
cgpa_hist_ax.set_ylabel('Frequency', fontsize=14)
plt.show()

# Scatter plot of CGPA (x) against Chance of Admit (y).
plt.figure(figsize=(8, 6))
cgpa_scatter_ax = sns.scatterplot(data=data, x='CGPA', y='Chance of Admit ')
cgpa_scatter_ax.set_title('CGPA vs. Chance of Admit', fontsize=16)
cgpa_scatter_ax.set_xlabel('CGPA', fontsize=14)
cgpa_scatter_ax.set_ylabel('Chance of Admit', fontsize=14)
plt.show()


In [None]:


# Number of applicants per Research flag; the two tick positions are
# relabelled with readable names.
plt.figure(figsize=(6, 4))
research_count_ax = sns.countplot(data=data, x='Research')
research_count_ax.set_title('Distribution of Research Experience', fontsize=16)
research_count_ax.set_xlabel('Research Experience', fontsize=14)
research_count_ax.set_ylabel('Frequency', fontsize=14)
plt.xticks([0, 1], ['No Research', 'Research'])
plt.show()


In [None]:

# Count applicants per Research flag.
# value_counts() orders its result by frequency, not by label, so without the
# sort the first bar would be whichever flag is more common, while the tick
# labels below are assigned purely by position. Sorting by the flag (0, 1)
# keeps bar 0 = 'No Research' and bar 1 = 'Research' regardless of the counts.
research_count = data['Research'].value_counts().sort_index()

# Share of each flag as a percentage of all rows.
research_percentage = research_count / len(data) * 100

# Bar plot of the counts: blue for flag 0, green for flag 1.
plt.figure(figsize=(6, 4))
research_count.plot(kind='bar', color=['blue', 'green'])
plt.title('Count of Research Experience')
plt.xlabel('Research Experience')
plt.ylabel('Count')
plt.xticks([0, 1], ['No Research', 'Research'], rotation=0)
plt.tight_layout()

# Display the plot
plt.show()

# Print the percentage
print("Percentage of Research Experience:")
print(research_percentage)


In [None]:
# Re-list the column labels before modelling; the labels used elsewhere in
# this notebook ('Chance of Admit ', 'LOR ') include a trailing space.
column_labels = data.columns
print(column_labels)


In [None]:
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.tree import plot_tree

# Feature matrix: everything except the target and 'Serial No.'.
# 'Serial No.' is presumably just a row identifier (the random-forest cell
# later in this notebook drops it as well); leaving it in would let the tree
# split on an ID, which is noise at best and leakage at worst.
X = data.drop(columns=['Serial No.', 'Chance of Admit '])
# Binary target: True (admitted) when the chance is strictly above 0.5.
y = data['Chance of Admit '] > 0.5

# Hold out 20% of the rows for testing; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit an unconstrained decision tree on the training rows.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict the held-out rows and score them.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Draw the fitted tree. feature_names is passed as a plain list because some
# scikit-learn releases reject a pandas Index for this parameter.
plt.figure(figsize=(15, 10))
plot_tree(model, feature_names=list(X.columns), class_names=['Not Admitted', 'Admitted'], filled=True, rounded=True)
plt.show()

# Confusion matrix (rows: true class, columns: predicted class) and accuracy.
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)
print(f"Accuracy: {accuracy:.2f}")




In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Restrict the model to three features.
# NOTE(review): the original comment attributes this choice to a feature
# importance analysis, but that analysis only appears later in the notebook -
# confirm where the selection comes from.
selected_features = ['GRE Score', 'TOEFL Score', 'CGPA']

X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

# Hyperparameter grid for the decision tree.
# max_features='auto' was deprecated and then removed from scikit-learn, so
# every candidate using it fails to fit on current releases. For classifiers
# it meant the same as 'sqrt' (already in the grid); None (use all features)
# is searched instead.
param_grid = {
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'sqrt', 'log2']
}

# Base estimator; GridSearchCV clones it for every candidate.
dtc = DecisionTreeClassifier(random_state=42)

# Exhaustive search with 5-fold cross-validation, ranked by accuracy.
grid_search = GridSearchCV(dtc, param_grid, cv=5, scoring='accuracy')

# Run the search on the reduced feature set.
grid_search.fit(X_train_selected, y_train)

# Winning parameter combination and the estimator refit with it.
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Accuracy of the winning model on the held-out rows (same reduced features).
best_accuracy = best_model.score(X_test_selected, y_test)
print("Best Hyperparameters:", best_params)
print("Best Model Accuracy:", best_accuracy)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Hyperparameter grid for the decision tree.
# max_features='auto' was deprecated and then removed from scikit-learn, so
# every candidate using it fails to fit on current releases. For classifiers
# it meant the same as 'sqrt' (already in the grid); None (use all features)
# is searched instead.
param_grid = {
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'sqrt', 'log2']
}

# Base estimator; GridSearchCV clones it for every candidate.
dtc = DecisionTreeClassifier(random_state=42)

# Exhaustive search with 5-fold cross-validation, ranked by accuracy.
grid_search = GridSearchCV(dtc, param_grid, cv=5, scoring='accuracy')

# Run the search on the full training feature matrix.
grid_search.fit(X_train, y_train)

# Winning parameter combination and the estimator refit with it.
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Accuracy of the winning model on the held-out rows.
best_accuracy = best_model.score(X_test, y_test)
print("Best Hyperparameters:", best_params)
print("Best Model Accuracy:", best_accuracy)


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

from sklearn.metrics import classification_report

# Decision tree with hand-copied hyperparameters.
# max_features='auto' is no longer accepted by scikit-learn; for classifiers
# it was an alias for 'sqrt', so 'sqrt' keeps the originally intended setting.
# NOTE(review): these values are hard-coded, presumably from an earlier grid
# search run; consider reading them from grid_search.best_params_ instead.
best_model = DecisionTreeClassifier(max_depth=None, max_features='sqrt', min_samples_leaf=4, min_samples_split=10, random_state=42)

# Fit on the training rows, restricted to the selected features
# (selected_features is defined in an earlier cell).
X_train_selected = X_train[selected_features]
best_model.fit(X_train_selected, y_train)

# Same column restriction for the held-out rows.
X_test_selected = X_test[selected_features]

# Predict the held-out rows.
y_pred = best_model.predict(X_test_selected)

# Accuracy on the held-out rows.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Confusion matrix (rows: true class, columns: predicted class).
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Per-class precision, recall and F1.
class_report = classification_report(y_test, y_pred)
print("Classification Report:")
print(class_report)


In [None]:
from sklearn.metrics import classification_report

# Per-class precision/recall/F1 for whatever y_pred currently holds, printed
# under a heading (one print call, newline-separated).
class_report = classification_report(y_true=y_test, y_pred=y_pred)
print("Classification Report:", class_report, sep="\n")


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, precision_score, recall_score, f1_score
import seaborn as sns
import matplotlib.pyplot as plt

# Fit a 100-tree random forest on the current training split (fit returns
# the estimator itself, so construction and training are chained).
rfc = RandomForestClassifier(n_estimators=100, random_state=1).fit(X_train, y_train)

# Predictions for the held-out rows.
y_pred_rfc = rfc.predict(X_test)

# Mean accuracy on the held-out rows.
accuracy = rfc.score(X_test, y_test)
print("Accuracy:", accuracy)

# Confusion matrix (rows: true class, columns: predicted class).
cm_rfc = confusion_matrix(y_test, y_pred_rfc)
print("Confusion Matrix:", cm_rfc, sep="\n")

# Precision, recall and F1 on the held-out rows.
precision, recall, f1 = (
    metric(y_test, y_pred_rfc)
    for metric in (precision_score, recall_score, f1_score)
)
for metric_label, metric_value in (("Precision:", precision), ("Recall:", recall), ("F1-score:", f1)):
    print(metric_label, metric_value)

# Confusion matrix of the model on its own training rows, for comparison.
cm_rfc_train = confusion_matrix(y_train, rfc.predict(X_train))

# One annotated heatmap per split: held-out rows first, then training rows.
heatmap_specs = (
    (cm_rfc, "Confusion Matrix - Test Dataset"),
    (cm_rfc_train, "Confusion Matrix - Train Dataset"),
)
for matrix, heatmap_title in heatmap_specs:
    f, ax = plt.subplots(figsize=(5, 5))
    sns.heatmap(matrix, annot=True, linewidths=0.5, linecolor="red", fmt=".0f", ax=ax)
    plt.title(heatmap_title)
    plt.xlabel("Predicted y values")
    plt.ylabel("Real y values")
    plt.show()


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt

# Features: every column except 'Serial No.' and the target.
X = data.drop(columns=["Serial No.", "Chance of Admit "])
# Target: the raw 'Chance of Admit ' values, binarised below.
y = data["Chance of Admit "]

# Label 1 when the chance is at or above the threshold, otherwise 0.
threshold = 0.5  # You can adjust this threshold if needed
y_binary = y.ge(threshold).astype(int)

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_binary, test_size=0.2, random_state=42
)

# Fit a 100-tree random forest (fit returns the estimator itself).
rfc = RandomForestClassifier(n_estimators=100, random_state=1).fit(X_train, y_train)

# Predictions for the held-out rows.
y_pred_rfc = rfc.predict(X_test)

# Accuracy on the held-out rows.
accuracy = accuracy_score(y_test, y_pred_rfc)
print("Accuracy:", accuracy)

# Confusion matrix (rows: true class, columns: predicted class).
cm_rfc = confusion_matrix(y_test, y_pred_rfc)
print("Confusion Matrix:", cm_rfc, sep="\n")

# Per-class precision, recall and F1.
class_report_rfc = classification_report(y_test, y_pred_rfc)
print("Classification Report:", class_report_rfc, sep="\n")

# Annotated heatmap of the held-out confusion matrix.
f, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(cm_rfc, annot=True, linewidths=0.5, linecolor="red", fmt=".0f", ax=ax)
plt.title("Confusion Matrix - Test Dataset")
plt.xlabel("Predicted y values")
plt.ylabel("Real y values")
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Importance scores of the fitted forest. `rfc` must already be fitted when
# this cell runs, otherwise the attribute does not exist.
feature_importances = rfc.feature_importances_

# Column labels of the feature matrix, in the order the forest saw them.
feature_names = X.columns

# Indices that rank the features from most to least important.
sorted_idx = feature_importances.argsort()[::-1]

# Bar positions, heights and tick labels, all in ranked order.
bar_positions = range(X.shape[1])
ranked_importances = feature_importances[sorted_idx]
ranked_labels = np.array(feature_names)[sorted_idx]

# Bar chart of the ranked importances with vertical feature labels.
plt.figure(figsize=(10, 6))
plt.bar(bar_positions, ranked_importances)
plt.xticks(bar_positions, ranked_labels, rotation=90)
plt.title("Feature Importance")
plt.xlabel("Features")
plt.ylabel("Importance")
plt.tight_layout()
plt.show()


In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid for the random forest.
# max_features='auto' was deprecated and then removed from scikit-learn, so
# every candidate using it fails to fit on current releases. For classifiers
# it meant the same as 'sqrt' (already in the grid); None (use all features)
# is searched instead.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'sqrt', 'log2']
}

# Base estimator; GridSearchCV clones it for every candidate, so `rfc` itself
# stays unfitted after this cell. Re-running the feature-importance cell
# afterwards would therefore fail until a forest is fitted again.
rfc = RandomForestClassifier(random_state=1)

# Exhaustive search with 5-fold cross-validation, ranked by accuracy.
grid_search = GridSearchCV(rfc, param_grid, cv=5, scoring='accuracy')

# Run the search on the current training split.
grid_search.fit(X_train, y_train)

# Winning parameter combination.
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# Estimator refit on the whole training split with the winning parameters.
best_model = grid_search.best_estimator_


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Random forest with hand-copied hyperparameters.
# max_features='auto' is no longer accepted by scikit-learn; for classifiers
# it was an alias for 'sqrt', so 'sqrt' keeps the originally intended setting.
# NOTE(review): these values are hard-coded, presumably from an earlier grid
# search run; consider reading them from grid_search.best_params_ instead.
best_rfc = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='sqrt',
    random_state=1
)

# Fit on the current training split.
best_rfc.fit(X_train, y_train)

# Predict the held-out rows.
y_pred_best = best_rfc.predict(X_test)

# Accuracy on the held-out rows.
accuracy = accuracy_score(y_test, y_pred_best)
print("Accuracy:", accuracy)

# Per-class precision, recall and F1.
class_report = classification_report(y_test, y_pred_best)
print("Classification Report:")
print(class_report)
