In [1]:
from google.colab import drive
import pandas as pd
# Below will prompt for authorization
drive.mount('/content/drive')

Mounted at /content/drive


## Research Question
1. Are there differences in important words between simplified and authentic reading texts?

### Remember the process:
1. Clean and tokenize your data (or set up a tokenization function that you can use with the TfidfVectorizer)
2. Generate at TF-IDF matrix
3. Use the matrix to train an ML model
4. Interpret your results! Depending on the model you use, this could take different forms. You can look at coefficients or at clusters.

In [2]:
# Load data

from google.colab import drive
import pandas as pd
# Below will prompt for authorization
drive.mount('/content/drive')

df = pd.read_csv(
    "/content/drive/MyDrive/Colab Notebooks/DS_5780_spring_25/reading_600_texts.csv",
    encoding='ISO-8859-1'
    )
# We always want to look at the information about the dataset as a sanity check.
df.info()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 591 entries, 0 to 590
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              591 non-null    int64  
 1   text            591 non-null    object 
 2   bt_readability  591 non-null    float64
 3   Source          588 non-null    object 
 4   Topic           591 non-null    object 
 5   Text type       591 non-null    object 
 6   Domain          591 non-null    object 
 7                   0 non-null      float64
dtypes: float64(2), int64(1), object(5)
memory usage: 37.1+ KB


In [3]:
text_type = df['Text type']
domain = df['Domain']
text = df['text']

Here I load python libraries I need:

In [4]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

Text cleaning:

In [5]:
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)  # Remove punctuation
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

df['clean_text'] = df['text'].apply(clean_text)


In [6]:
# Label encode 'Text type': Simplified vs Authentic
label_encoder = LabelEncoder()
df['text_type_encoded'] = label_encoder.fit_transform(df['Text type'])

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(df['clean_text'])
y = df['text_type_encoded']


In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

In [13]:
y_pred = clf.predict(X_test)
print("Classification Report (Simplified vs Authentic):\n")
print(classification_report(y_test, y_pred))

Classification Report (Simplified vs Authentic):

              precision    recall  f1-score   support

           0       0.66      0.83      0.74        54
           1       0.82      0.65      0.72        65

    accuracy                           0.73       119
   macro avg       0.74      0.74      0.73       119
weighted avg       0.75      0.73      0.73       119



In [14]:
from sklearn import metrics

#print out metrics
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("Logistic Regression Precision:",metrics.precision_score(y_test, y_pred))
print("Logistic Regression Recall:",metrics.recall_score(y_test, y_pred))

Logistic Regression Accuracy: 0.7310924369747899
Logistic Regression Precision: 0.8235294117647058
Logistic Regression Recall: 0.6461538461538462


In [15]:
feature_names = vectorizer.get_feature_names_out()
coefficients = clf.coef_[0]

# Top words for Authentic (positive coefficients)
top_authentic_indices = coefficients.argsort()[-20:][::-1]
print("\nTop words for Authentic texts:")
for i in top_authentic_indices:
    print(f"{feature_names[i]} ({coefficients[i]:.4f})")

# Top words for Simplified (negative coefficients)
top_simplified_indices = coefficients.argsort()[:20]
print("\nTop words for Simplified texts:")
for i in top_simplified_indices:
    print(f"{feature_names[i]} ({coefficients[i]:.4f})")


Top words for Authentic texts:
people (2.2045)
called (1.4780)
like (1.4117)
made (1.1394)
many (1.1109)
make (1.0718)
things (0.9121)
means (0.8896)
electricity (0.8863)
way (0.8789)
word (0.8443)
started (0.8016)
used (0.7764)
could (0.7470)
comes (0.7365)
computer (0.7309)
plants (0.6955)
get (0.6939)
makes (0.6839)
kinds (0.6825)

Top words for Simplified texts:
known (-1.0100)
military (-0.7670)
within (-0.7560)
system (-0.7486)
political (-0.7192)
considered (-0.7049)
although (-0.6789)
research (-0.6727)
generally (-0.6496)
term (-0.6355)
processes (-0.6187)
iron (-0.6182)
specific (-0.6076)
either (-0.6058)
typically (-0.5894)
form (-0.5712)
terms (-0.5707)
population (-0.5699)
sea (-0.5635)
environmental (-0.5621)


### Discussion
The analysis reveals that there are clear differences in important words between Simplified and Authentic reading texts. Authentic texts are characterized by more general-purpose, high-frequency words, suggesting a natural narrative or expository style. In contrast, Simplified texts unexpectedly use more technical and formal vocabulary, with a focus on defining concepts across academic domains. This indicates that simplified texts prioritize clarity and precision in presenting information, while authentic texts emphasize contextual storytelling with everyday language.