**Part 1: Feature engineering and building the final model**


In [None]:
# Mount Google Drive at /content/drive so the cells below can read the paper folders.
# NOTE(review): google.colab is presumably only importable inside a Colab runtime — confirm before running elsewhere.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import os

# Sanity check: show the entries sitting directly under the reference-paper folder.
folder_path = '/content/drive/MyDrive/Reference'
files = [entry for entry in os.listdir(folder_path)]
print(f"Files in folder: {files}")

Files in folder: ['Publishable', 'Non-Publishable', '.DS_Store']


In [None]:
import os
import pandas as pd

folder_path = '/content/drive/MyDrive/Reference'

# Collect one {'text', 'label'} record per file found under each label folder.
# The label folders contain sub-folders (e.g. Publishable/NeurIPS), so the tree
# is walked recursively; calling open() on every os.listdir() entry raised
# IsADirectoryError on those sub-folders.
data = []
for label in ['Publishable', 'Non-Publishable']:
    label_folder = os.path.join(folder_path, label)
    for dir_path, dir_names, file_names in os.walk(label_folder):
        dir_names.sort()  # visit sub-folders in a deterministic order
        for filename in sorted(file_names):
            if filename.startswith('.'):
                # Hidden metadata files such as .DS_Store are not papers.
                continue
            file_path = os.path.join(dir_path, filename)
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    text = f.read()
            except UnicodeDecodeError:
                # latin-1 maps every possible byte to a character, so this
                # fallback cannot itself raise UnicodeDecodeError.
                with open(file_path, 'r', encoding='latin-1') as f:
                    text = f.read()
            data.append({'text': text, 'label': label})


IsADirectoryError: [Errno 21] Is a directory: '/content/drive/MyDrive/Reference/Publishable/NeurIPS'

In [None]:
import os
import pdfplumber
import string
import pandas as pd
import textstat
from textblob import TextBlob
import language_tool_python

# Initialize LanguageTool
# Module-level 'en-US' checker; check_writing_quality() below reads this global.
tool = language_tool_python.LanguageTool('en-US')

# Clean text function
def clean_text(text):
    """Normalise a piece of text for feature extraction.

    Lower-cases ``text``, deletes every character listed in
    ``string.punctuation`` and trims leading/trailing whitespace.
    """
    punctuation_remover = str.maketrans("", "", string.punctuation)
    lowered = text.lower()
    return lowered.translate(punctuation_remover).strip()

# Extract sections function
def extract_sections(text):
    """Split paper text into (abstract, introduction, conclusion) by keyword search.

    Each section starts at the first case-insensitive occurrence of its
    keyword. The abstract stops at the next "introduction", the introduction
    stops at the next "conclusion", and the conclusion runs to the end of the
    text. A section whose keyword is absent is returned as "". Every section
    is passed through clean_text() before being returned.

    NOTE(review): the conclusion slice includes whatever follows it in the
    document (e.g. references) — confirm that is intended.
    """
    haystack = text.lower()

    def _slice_from(start_word, stop_word=None):
        # Locate the section start; a missing keyword yields an empty section.
        begin = haystack.find(start_word)
        if begin == -1:
            return ""
        if stop_word is None:
            return text[begin:]
        # Only look for the stop keyword at or after the section start.
        finish = haystack.find(stop_word, begin)
        return text[begin:] if finish == -1 else text[begin:finish]

    raw_sections = (
        _slice_from("abstract", "introduction"),
        _slice_from("introduction", "conclusion"),
        _slice_from("conclusion"),
    )
    return tuple(clean_text(section) for section in raw_sections)

# Perform sentiment analysis
def sentiment_analysis(text):
    """Return the TextBlob sentiment polarity computed for ``text``."""
    return TextBlob(text).sentiment.polarity

# Function to check grammar and passive voice
def check_writing_quality(text):
    """Run the shared LanguageTool checker over ``text`` and summarise the result.

    Returns a dict with:
      - 'grammar_errors': total number of matches reported by ``tool.check``.
      - 'passive_voice_count': number of matches whose message contains
        the substring 'Passive voice'.
    """
    matches = tool.check(text)
    passive_hits = [hit for hit in matches if 'Passive voice' in hit.message]
    return dict(
        grammar_errors=len(matches),
        passive_voice_count=len(passive_hits),
    )

folder_path = '/content/drive/MyDrive/Reference'

data = []

# Process PDF files
# Build one feature row per labelled reference PDF.
# - The label folders contain sub-folders (e.g. Publishable/NeurIPS), so the
#   tree is walked recursively; a flat os.listdir() silently skips every PDF
#   that is not directly inside the label folder.
# - Directories and files are visited in sorted order so the row order (and
#   therefore any seeded train/test split) is reproducible.
# - The extension check is case-insensitive so "paper.PDF" is not dropped.
# NOTE(review): all text features are computed on the output of clean_text(),
# i.e. lower-cased with punctuation removed — confirm that is intended for the
# readability and grammar metrics.
for category in ["Publishable", "Non-Publishable"]:
    category_path = os.path.join(folder_path, category)
    label = 0 if category == "Non-Publishable" else 1

    for dir_path, dir_names, file_names in os.walk(category_path):
        dir_names.sort()
        for file_name in sorted(file_names):
            if not file_name.lower().endswith('.pdf'):
                continue
            file_path = os.path.join(dir_path, file_name)

            try:
                # Only hold the PDF open while its pages are being read.
                with pdfplumber.open(file_path) as pdf:
                    page_texts = [page.extract_text() for page in pdf.pages]
                # Pages with no extractable text (None / "") are skipped.
                text = "".join(page_text + "\n" for page_text in page_texts if page_text)

                abstract, intro, conclusion = extract_sections(text)
                writing_quality_features = check_writing_quality(conclusion)

                data.append({
                    'paper_name': file_name,
                    'abstract': abstract,
                    'introduction': intro,
                    'conclusion': conclusion,
                    'conclusion_length': len(conclusion.split()),
                    'readability_score': textstat.flesch_kincaid_grade(conclusion),
                    'flesch_score': textstat.flesch_reading_ease(conclusion),
                    'sentiment_score': sentiment_analysis(conclusion),
                    'grammar_errors': writing_quality_features['grammar_errors'],
                    'passive_voice_count': writing_quality_features['passive_voice_count'],
                    'label': label
                })
            except Exception as e:
                # Best effort: report the failing paper and keep processing the rest.
                print(f"Error processing {file_name}: {e}")

# The dict keys above are already the final column names, so no rename is needed.
df = pd.DataFrame(data)

# Display the updated DataFrame
print(f"DataFrame created with {len(df)} entries.")
print(df.head(15))


In [None]:
# Show the column names of the feature table.
print(df.columns.tolist())


In [None]:
# Keep only the numeric features used for modelling, plus the target column.
selected_columns = [
    'conclusion_length',
    'readability_score',
    'flesch_score',
    'sentiment_score',
    'grammar_errors',
    'label',
]

new_df = df.loc[:, selected_columns]

print(new_df.head())


In [None]:
from sklearn.model_selection import train_test_split
import numpy as np

# Separate the feature columns from the target column.
X = new_df.drop(columns=['label'])
y = new_df['label']

# Convert to numpy arrays
X_features = X.to_numpy()
y_labels = y.to_numpy()

# Split the data into training and test sets (70% train, 30% test).
# stratify keeps the Publishable / Non-Publishable ratio the same in both
# parts, so a small test set is not left with only one class by chance.
X_train, X_test, y_train, y_test = train_test_split(
    X_features,
    y_labels,
    test_size=0.3,
    random_state=42,
    stratify=y_labels,
)

print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")


In [None]:
from lazypredict.Supervised import LazyClassifier
import pandas as pd

# Fit LazyClassifier on the train/test split to get a comparison table of models.
clf = LazyClassifier(custom_metric=None, ignore_warnings=True, verbose=0)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

# Print the full comparison table first, then just its first five rows.
for heading, table in (
    ("Model Comparison:", models),
    ("\nTop 5 Models:", models.head(5)),
):
    print(heading)
    print(table)


In [None]:
# Everything this cell uses is imported here: the original relied on imports
# from the *following* cell, so it raised NameError on Restart & Run All.
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV

# Define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.5, 1.0]
}

# Perform Grid Search (5-fold cross-validation on the training set, scored by accuracy)
grid_search = GridSearchCV(AdaBoostClassifier(random_state=42), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Cross-Validation Accuracy: {grid_search.best_score_:.2f}")

# Evaluate the best estimator found by the search on the held-out test set.
best_adaboost = grid_search.best_estimator_
y_pred = best_adaboost.predict(X_test)

# Re-evaluate the model
print("Optimized AdaBoostClassifier Model Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Final model: AdaBoost with fixed hyper-parameters.
adaboost_clf = AdaBoostClassifier(n_estimators=50, learning_rate=0.01, random_state=42)

# fit() returns the fitted estimator, so training and test-set prediction are chained.
y_pred = adaboost_clf.fit(X_train, y_train).predict(X_test)

# Report accuracy, the per-class report and the confusion matrix on the test set.
test_accuracy = accuracy_score(y_test, y_pred)
print("AdaBoostClassifier Model Performance:")
print(f"Accuracy: {test_accuracy:.2f}")
for heading, report in (
    ("\nClassification Report:", classification_report(y_test, y_pred)),
    ("\nConfusion Matrix:", confusion_matrix(y_test, y_pred)),
):
    print(heading)
    print(report)



In [None]:
import joblib

# Persist the trained AdaBoost model with joblib so it can be downloaded and reused.
# NOTE(review): this path is relative, while Part 2 loads "/content/IITKGP.joblib";
# the two only agree when the working directory is /content — confirm.
model_path = "IITKGP.joblib"
joblib.dump(adaboost_clf, model_path)
print(f"Model saved to {model_path}")

**Part 2: Using the saved model for prediction**

In [None]:
import joblib
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

model_path = "/content/IITKGP.joblib"

# Load the saved AdaBoost model
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load model files from a trusted source.
loaded_model = joblib.load(model_path)
print("Model loaded successfully.")

In [None]:
import os

# Sanity check: show the entries sitting directly under the folder of papers to classify.
folder_path = '/content/drive/MyDrive/Papers'
files = [entry for entry in os.listdir(folder_path)]
print(f"Files in folder: {files}")

In [None]:
import os
import pdfplumber
import string
import pandas as pd
import textstat
from textblob import TextBlob
import language_tool_python

# Initialize LanguageTool
# Module-level 'en-US' checker; check_writing_quality() below reads this global.
tool = language_tool_python.LanguageTool('en-US')

# Clean text function
def clean_text(text):
    """Normalise a piece of text for feature extraction.

    Lower-cases ``text``, deletes every character listed in
    ``string.punctuation`` and trims leading/trailing whitespace.
    """
    punctuation_remover = str.maketrans("", "", string.punctuation)
    lowered = text.lower()
    return lowered.translate(punctuation_remover).strip()

# Extract sections function
def extract_sections(text):
    """Split paper text into (abstract, introduction, conclusion) by keyword search.

    Each section starts at the first case-insensitive occurrence of its
    keyword. The abstract stops at the next "introduction", the introduction
    stops at the next "conclusion", and the conclusion runs to the end of the
    text. A section whose keyword is absent is returned as "". Every section
    is passed through clean_text() before being returned.

    NOTE(review): the conclusion slice includes whatever follows it in the
    document (e.g. references) — confirm that is intended.
    """
    haystack = text.lower()

    def _slice_from(start_word, stop_word=None):
        # Locate the section start; a missing keyword yields an empty section.
        begin = haystack.find(start_word)
        if begin == -1:
            return ""
        if stop_word is None:
            return text[begin:]
        # Only look for the stop keyword at or after the section start.
        finish = haystack.find(stop_word, begin)
        return text[begin:] if finish == -1 else text[begin:finish]

    raw_sections = (
        _slice_from("abstract", "introduction"),
        _slice_from("introduction", "conclusion"),
        _slice_from("conclusion"),
    )
    return tuple(clean_text(section) for section in raw_sections)

# Perform sentiment analysis
def sentiment_analysis(text):
    """Return the TextBlob sentiment polarity computed for ``text``."""
    return TextBlob(text).sentiment.polarity

# Function to check grammar and passive voice
def check_writing_quality(text):
    """Run the shared LanguageTool checker over ``text`` and summarise the result.

    Returns a dict with:
      - 'grammar_errors': total number of matches reported by ``tool.check``.
      - 'passive_voice_count': number of matches whose message contains
        the substring 'Passive voice'.
    """
    matches = tool.check(text)
    passive_hits = [hit for hit in matches if 'Passive voice' in hit.message]
    return dict(
        grammar_errors=len(matches),
        passive_voice_count=len(passive_hits),
    )

folder_path = '/content/drive/MyDrive/Papers'

data = []

# Process PDF files in the folder
# Build one feature row per PDF to be classified.
# - Files are visited in sorted order so the row order is reproducible.
# - The extension check is case-insensitive so "paper.PDF" is not dropped.
for file_name in sorted(os.listdir(folder_path)):
    if not file_name.lower().endswith('.pdf'):
        continue
    file_path = os.path.join(folder_path, file_name)

    try:
        # Only hold the PDF open while its pages are being read.
        with pdfplumber.open(file_path) as pdf:
            page_texts = [page.extract_text() for page in pdf.pages]
        # Pages with no extractable text (None / "") are skipped.
        text = "".join(page_text + "\n" for page_text in page_texts if page_text)

        abstract, intro, conclusion = extract_sections(text)
        writing_quality_features = check_writing_quality(conclusion)

        data.append({
            'paper_name': file_name,
            'abstract': abstract,
            'introduction': intro,
            'conclusion': conclusion,
            'conclusion_length': len(conclusion.split()),
            'readability_score': textstat.flesch_kincaid_grade(conclusion),
            'flesch_score': textstat.flesch_reading_ease(conclusion),
            'sentiment_score': sentiment_analysis(conclusion),
            'grammar_errors': writing_quality_features['grammar_errors'],
            'passive_voice_count': writing_quality_features['passive_voice_count'],
            # Placeholder only: these papers are unlabelled, and the prediction
            # cells do not read this column.
            'label': 0
        })
    except Exception as e:
        # Best effort: report the failing paper and keep processing the rest.
        print(f"Error processing {file_name}: {e}")

# Create DataFrame
# The dict keys above are already the final column names, so no rename is needed.
df = pd.DataFrame(data)

print(f"DataFrame created with {len(df)} entries.")
print(df.head(15))

In [None]:
# Show the column names of the feature table.
print(df.columns.tolist())


In [None]:
# The five numeric features the model consumes, in this order.
selected_columns = ['conclusion_length', 'readability_score', 'flesch_score', 'sentiment_score', 'grammar_errors']

# Create a new DataFrame with only these columns.
# .copy() makes new_df an independent frame, so adding a column to it later
# neither touches df nor triggers pandas' SettingWithCopyWarning.
new_df = df[selected_columns].copy()

print(new_df.head())


In [None]:
# Make predictions on the new_df
# - Exclude 'predicted_label' from the inputs so re-running this cell does not
#   feed a previous prediction back in as an extra feature.
# - Pass a NumPy array: the model was fitted on NumPy arrays (X.to_numpy() in
#   Part 1), so predicting from a DataFrame emits a feature-name warning.
feature_columns = [column for column in new_df.columns if column != 'predicted_label']
predictions = loaded_model.predict(new_df[feature_columns].to_numpy())

# assign() returns a new frame instead of writing into what may be a slice of
# df, which avoids SettingWithCopyWarning.
new_df = new_df.assign(predicted_label=predictions)

print(new_df.head())

In [None]:
# Pair each paper name with its prediction, translate the 0/1 codes into
# readable labels, and order the rows by paper name.
final_df = (
    pd.concat([df['paper_name'], new_df['predicted_label']], axis=1)
    .assign(
        predicted_label=lambda frame: frame['predicted_label'].replace(
            {0: 'Unpublishable', 1: 'Publishable'}
        )
    )
    .sort_values(by='paper_name', ascending=True)
)

# Write the sorted table to CSV; the index is left out of the file.
output_path = '/content/final_predictions.csv'
final_df.to_csv(output_path, index=False)