## Extracting Text from pdf

In [84]:
import pdfplumber
import os

def extract_text(file_path):
    with pdfplumber.open(file_path) as pdf:
        text = ""
        for page in pdf.pages:
            text += page.extract_text()  # helps in extracting text from the pdf
    return text


file_path = r"D:\publishable-Advisor\dataset\Reference\Publishable - Non Confrence\R006.pdf"
print(extract_text(file_path=file_path))

Detailed Action Identification in Baseball Game
Recordings
Abstract
ThisresearchintroducesMLB-YouTube,anewandcomplexdatasetcreatedfor
nuanced activity recognition in baseball videos. This dataset is structured to
supporttwotypesofanalysis: oneforclassifyingactivitiesinsegmentedvideos
andanotherfordetectingactivitiesinunsegmented,continuousvideostreams. This
studyevaluatesseveralmethodsforrecognizingactivities,focusingonhowthey
capture the temporal organization of activities in videos. This evaluation starts
with categorizing segmented videos and progresses to applying these methods
tocontinuousvideofeeds. Additionally,thispaperassessestheeffectivenessof
different models in the challenging task of forecasting pitch velocity and type
usingbaseballbroadcastvideos. Thefindingsindicatethatincorporatingtemporal
dynamicsintomodelsisbeneficialfordetailedactivityrecognition.
1 Introduction
Actionrecognition,asignificantproblemincomputervision,findsextensiveuseinsports. Profes-
sionalsportingeve

## Creating a Script to automate the process of this 

In [85]:
def extract_text(file_path):
    with pdfplumber.open(file_path) as pdf:
        text = ""
        for page in pdf.pages:
            text += page.extract_text()  # helps in extracting text from the pdf
    return text

pdf_folder = r"D:\publishable-Advisor\dataset\Reference\Publishable - Non Confrence"
pdf_text = []

for filename in os.listdir(pdf_folder):
    if filename.endswith(".pdf"):
        file_path = os.path.join(pdf_folder,filename)
        text = extract_text(file_path)
        pdf_text.append(text)


# Sving the extracted text to a text file
for i, text in enumerate(pdf_text):
    with open(rf"D:\publishable-Advisor\dataset\txt format\publishable\{i+1}.txt", "w",encoding="utf-8") as f: # To avoid encoding error using utf-8
        f.write(text)

## using the same script for the non publishable papers

In [86]:
pdf_folder = r"D:\publishable-Advisor\dataset\Reference\Non-Publishable"
pdf_text = []
for filename in os.listdir(pdf_folder):
    if filename.endswith(".pdf"):
        file_path = os.path.join(pdf_folder, filename)
        text = extract_text(file_path)
        pdf_text.append(text)


for i,text in enumerate(pdf_text):
    with open(rf"D:\publishable-Advisor\dataset\txt format\non punlishable\{i+1}.txt", "w", encoding = "utf-8") as f:
        f.write(text)

## Further Text cleaning 

In [87]:
import re

def clean_txt(text):
    text = text.lower()
    text = re.sub(r'[^\w\s.,-]', '', text)
    ## removing unwanted digits

    text = re.sub(r'\b\d{1,2}\b', '', text)

    ## removing extra spaces
    text = re.sub(r'\s+'," ", text).strip()
    return text


# folder for publishable and non publishable text files
publishable_folder = r"D:\publishable-Advisor\dataset\txt format\publishable"
non_publishable_folder = r"D:\publishable-Advisor\dataset\txt format\non punlishable"

# lists for holding the text and there corresponding labels 
texts = []
labels = []

# processing first for the publishable text files
for i in range(1,11):
    with open (os.path.join(publishable_folder, f"{i}.txt"), "r", encoding = "utf-8") as f:
        text = f.read()
        text = clean_txt(text=text)
        texts.append(text)
        labels.append(1) ## ublishable text files are labeled as 1


for i in range(1,6):
    with open(os.path.join(non_publishable_folder, f"{i}.txt"), "r", encoding = "utf-8") as f:
        text = f.read()
        text = clean_txt(text=text)
        texts.append(text)
        labels.append(0)


#sample check
print(texts)
print(labels)


[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]


## Represnting text as features 

In [88]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initializing the vectorizer
vectorizer = TfidfVectorizer(max_features = 1000)

X = vectorizer.fit_transform(texts)
print(X.shape)
print(X.toarray()) # to see the TDF-IDF matrix

(15, 1000)
[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.10238657 0.07678993 ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [89]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.6, random_state=79)

# Initialize the Random Forest model
model = RandomForestClassifier(class_weight='balanced', random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")

# Detailed evaluation
print(classification_report(y_test, y_pred))



Accuracy: 0.7777777777777778
              precision    recall  f1-score   support

           0       1.00      0.33      0.50         3
           1       0.75      1.00      0.86         6

    accuracy                           0.78         9
   macro avg       0.88      0.67      0.68         9
weighted avg       0.83      0.78      0.74         9



In [90]:
import pdfplumber
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

# Function to extract text from a PDF
def extract_text(file_path):
    with pdfplumber.open(file_path) as pdf:
        text = ""
        for page in pdf.pages:
            text += page.extract_text()  # Extracts text from each page
    return text

# Function to clean the extracted text
def clean_txt(text):
    text = text.lower()
    text = re.sub(r'[^\w\s.,-]', '', text)  # Removing unwanted characters
    text = re.sub(r'\b\d{1,2}\b', '', text)  # Removing numbers
    text = re.sub(r'\s+', " ", text).strip()  # Removing extra spaces
    return text

# Load your dataset or a small sample of text to fit the vectorizer
# Example: A list of texts used to fit the vectorizer and model
texts = ["sample text from previous documents", "another sample text"]

# Initialize the vectorizer and fit it on the existing data
vectorizer = TfidfVectorizer(max_features=500)
X = vectorizer.fit_transform(texts)  # Fit the vectorizer on your texts

# Initialize and train the Random Forest model
model = RandomForestClassifier(class_weight='balanced', random_state=42)
labels = [1, 0]  # Example labels for the text dataset
model.fit(X, labels)

# Sample PDF path to check
sample_pdf_path = r"D:\publishable-Advisor\dataset\Reference\Non-Publishable\R002.pdf"

# Extract text from the sample PDF
extracted_text = extract_text(file_path=sample_pdf_path)

# Clean the extracted text
cleaned_text = clean_txt(extracted_text)

# Transform the cleaned text using the fitted vectorizer
X_new = vectorizer.transform([cleaned_text])  # Transform using the fitted vectorizer

# Predict the class (publishable or not)
prediction = model.predict(X_new)

# Output the result
if prediction == 1:
    print("The PDF is classified as Publishable.")
else:
    print("The PDF is classified as Non-Publishable.")


The PDF is classified as Publishable.
