In [1]:
import pandas as pd

# Load the CSV file
file_path = 'C:\\Users\\gapar\\Downloads\\external_df.csv'

data = pd.read_csv(file_path)

# Display the first few rows of the dataframe to understand its structure
data.head()

Unnamed: 0,problem,level,type
0,Kevin Kangaroo begins hopping on a number line...,Level 5,Algebra
1,The ratio of the areas of two squares is $\fra...,Level 4,Algebra
2,"If $\sqrt{2\sqrt{t-2}} = \sqrt[4]{7 - t}$, the...",Level 4,Algebra
3,Let $t(x) = \sqrt{3x+1}$ and $f(x)=5-t(x)$. Wh...,Level 4,Algebra
4,James has a total of 66 dollars in his piggy b...,Level 2,Algebra


In [2]:
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.corpus import stopwords

# Ensure NLTK stop words are available
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))  # Convert set to set for faster operations

forbidden_words = {'draw', 'ticklength', 'laxis', 'arrowsize', 'black', 'gray', 'fill',
                   'cycle', 'linewidth', 'frac', 'graf', 'red', 'asy', 'pmatrix', 'label', 'sqrt',
                   'xmax', 'func', 'unitsize', 'mathop', 'hline', 'mathbf', 'le', 'begin', 'end',
                   'letter', 'vmatrix', 'import', 'overline', 'ax', 'cz', 'bx', 'pa', 'pb', 'pc',
                   'ptick', 'ticks', 'ybottom', 'arrows', 'ymin', 'xmin', 'ymax', 'ytop', 'xleft',
                   'dps', 'pen', 'yequals', 'bool', 'tickdown', 'answer', 'invisible', 'ticksarrx',
                   'ticksarry', 'gx', 'gy', 'gs', 'ceil', 'void', 'bottomtop', 'texnormal', 'wholetickdown',
                   'tickspace', 'ticklen', 'usegrid', 'axispen', 'xstep', 'axisarrowsize', 'xaxis',
                   'yaxis', 'xequals', 'leftright', 'lceil', 'rceil', 'array', 'textnormal', 'linetype',
                   'step', 'size', 'stickframe', 'true', 'false', 'textbf', 'cdot', 'pathticks'}

data['problem'].fillna('', inplace=True)  # Ensure all entries are strings
# Removing stop words
data['filtered_problem'] = data['problem'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stop_words]))

# Removing forbidden words
regex_forbidden = re.compile(r'\b(' + '|'.join(forbidden_words) + r')\b', re.IGNORECASE)
data['filtered_problem'] = data['filtered_problem'].apply(lambda x: regex_forbidden.sub("", x))

# Removing special characters and digits
data['filtered_problem'] = data['filtered_problem'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
data['filtered_problem'] = data['filtered_problem'].apply(lambda x: re.sub(r'\d+', '', x))

# Initialize CountVectorizer for trigrams
vectorizer_trigrams = CountVectorizer(ngram_range=(3,3), max_features=50)

# Extract features and target
X = vectorizer_trigrams.fit_transform(data['filtered_problem'])
y = data['type']  # Assuming you want to classify by 'type'

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# SVM Classifier
clf = make_pipeline(StandardScaler(with_mean=False), SVC(kernel='linear'))
clf.fit(X_train, y_train)

# Predictions
y_pred = clf.predict(X_test)

# Results
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gapar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['problem'].fillna('', inplace=True)  # Ensure all entries are strings


Accuracy: 0.323
Classification Report:
                         precision    recall  f1-score   support

               Algebra       0.27      0.94      0.41      1167
Counting & Probability       0.41      0.15      0.23       504
              Geometry       0.61      0.27      0.38       528
  Intermediate Algebra       0.69      0.24      0.36       892
         Number Theory       0.63      0.08      0.15       567
            Prealgebra       0.48      0.01      0.03       819
           Precalculus       0.55      0.04      0.08       523

              accuracy                           0.32      5000
             macro avg       0.52      0.25      0.23      5000
          weighted avg       0.50      0.32      0.25      5000



In [3]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.corpus import stopwords
from collections import Counter

# Ensure NLTK stop words are available
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

forbidden_words = set([
    'draw', 'ticklength', 'laxis', 'arrowsize', 'black', 'gray', 'fill',
    'cycle', 'linewidth', 'frac', 'graf', 'red', 'asy', 'pmatrix', 'label', 'sqrt',
    'xmax', 'func', 'unitsize', 'mathop', 'hline', 'mathbf', 'le', 'begin', 'end',
    'letter', 'vmatrix', 'import', 'overline', 'ax', 'cz', 'bx', 'pa', 'pb', 'pc',
    'ptick', 'ticks', 'ybottom', 'arrows', 'ymin', 'xmin', 'ymax', 'ytop', 'xleft',
    'dps', 'pen', 'yequals', 'bool', 'tickdown', 'answer', 'invisible', 'ticksarrx',
    'ticksarry', 'gx', 'gy', 'gs', 'ceil', 'void', 'bottomtop', 'texnormal', 'wholetickdown',
    'tickspace', 'ticklen', 'usegrid', 'axispen', 'xstep', 'axisarrowsize', 'xaxis',
    'yaxis', 'xequals', 'leftright', 'lceil', 'rceil', 'array', 'textnormal', 'linetype',
    'step', 'size', 'stickframe', 'true', 'false', 'textbf', 'cdot', 'pathticks'
])

# Load and prepare the data
data['problem'].fillna('', inplace=True)

# Filtering unwanted words
def clean_text(text):
    text = ' '.join(word for word in text.split() if word.lower() not in stop_words)
    text = regex_forbidden.sub("", text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    return text

data['filtered_problem'] = data['problem'].apply(clean_text)

# Initialize a function to find unique trigrams per category
def find_unique_trigrams(data, category):
    # Vectorize the category specific data
    cat_data = data[data['type'] == category]['filtered_problem']
    vectorizer = CountVectorizer(ngram_range=(3,3), max_features=1000)
    X_cat = vectorizer.fit_transform(cat_data)
    cat_trigrams = vectorizer.get_feature_names_out()
    
    # Vectorize the non-category data
    non_cat_data = data[data['type'] != category]['filtered_problem']
    X_non_cat = vectorizer.fit_transform(non_cat_data)
    non_cat_trigrams = vectorizer.get_feature_names_out()
    
    # Find unique trigrams for the category
    unique_trigrams = set(cat_trigrams) - set(non_cat_trigrams)
    return list(unique_trigrams)[:50]  # return top 50 unique trigrams if available

# Generate features using unique trigrams
categories = data['type'].unique()
unique_trigrams_per_cat = {cat: find_unique_trigrams(data, cat) for cat in categories}
all_trigrams = set(sum([trigs for trigs in unique_trigrams_per_cat.values()], []))

# Adjust vectorizer to use the unique trigrams
vectorizer_trigrams = CountVectorizer(vocabulary=all_trigrams, ngram_range=(3,3))
X = vectorizer_trigrams.fit_transform(data['filtered_problem'])
y = data['type']

# Split and train the classifier
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
clf = make_pipeline(StandardScaler(with_mean=False), SVC(kernel='linear'))
clf.fit(X_train, y_train)

# Evaluation
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gapar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['problem'].fillna('', inplace=True)


Accuracy: 0.285
Classification Report:
                         precision    recall  f1-score   support

               Algebra       0.25      0.99      0.40      1167
Counting & Probability       0.90      0.09      0.17       504
              Geometry       0.73      0.08      0.15       528
  Intermediate Algebra       0.75      0.07      0.12       892
         Number Theory       0.80      0.05      0.09       567
            Prealgebra       0.73      0.05      0.09       819
           Precalculus       0.88      0.10      0.18       523

              accuracy                           0.28      5000
             macro avg       0.72      0.20      0.17      5000
          weighted avg       0.66      0.28      0.19      5000

