In [3]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# 1. Fetch the NLTK data packages this script relies on: the stopword corpus
#    and the punkt tokenizer models. Re-running is harmless; NLTK reports
#    packages that are already up to date.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# 2-5. Load the scraped posts and narrow them down in a single pipeline:
#      - keep only the columns needed for text analysis,
#      - drop rows whose TEXT is missing,
#      - drop rows whose TEXT exactly repeats an earlier row.
text_columns = ['SNO', 'SUBREDDIT_NAME', 'TEXT', 'DATE', 'UPVOTES', 'DOWNVOTES']
df = (
    pd.read_csv('interstellar_posts_with_votes.csv')[text_columns]
    .dropna(subset=['TEXT'])
    .drop_duplicates(subset=['TEXT'])
)

# 6. Negation words that must survive stopword removal, because dropping them
#    would flip the meaning of a sentence.
negative_words = {
    # direct negators
    "not", "no", "nor", "never", "neither", "cannot",
    # contraction suffix; it can only match a token if the apostrophe is
    # still present when the text is tokenized
    "n't",
    # negative pronouns / adverbs / prepositions
    "nothing", "nowhere", "without",
    # near-negatives
    "barely", "hardly", "scarcely",
}

# 7. Cleaning function
# The patterns are compiled once because clean_text runs on every row.
_URL_RE = re.compile(r'http\S+')
# The lookbehind stops a match from starting in the middle of a word, so
# "hour/day" or "Cooper/Murph" stay intact while "u/name", "r/sub" and
# "/r/sub" are still removed.
_MENTION_RE = re.compile(r'(?<!\w)/?[ur]/\w+')
# Contractions whose stem is not a word once "n't" is split off.
# \u2019 is the typographic apostrophe.
_IRREGULAR_NEGATION_RE = re.compile(r"\b(can|won|shan|ain)['\u2019]t\b", re.IGNORECASE)
_IRREGULAR_NEGATIONS = {
    'can': 'can not',
    'won': 'will not',
    'shan': 'shall not',
    'ain': 'is not',
}
_NEGATION_SUFFIX_RE = re.compile(r"n['\u2019]t\b", re.IGNORECASE)
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')

# Filled on first use so the stopword corpus is read once, not once per post.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def clean_text(text):
    """Normalise one raw post into a space-separated string of tokens.

    Steps, in order: strip URLs, strip standalone u/ and r/ mentions, spell
    out "n't" contractions as "not", replace every non-letter with a space,
    lowercase, tokenize, then drop stopwords (except those listed in
    ``negative_words``) and one-letter tokens.

    Args:
        text: The raw post body as a ``str``.

    Returns:
        The surviving tokens joined by single spaces; an empty string when
        nothing survives.
    """
    # (a) Remove URLs
    text = _URL_RE.sub('', text)
    # (b) Remove u/ or r/ mentions
    text = _MENTION_RE.sub('', text)
    # (c) Expand negative contractions *before* punctuation is stripped.
    #     Otherwise "don't" turns into the stopwords "don" and "t" and the
    #     negation disappears from the cleaned text.
    text = _IRREGULAR_NEGATION_RE.sub(
        lambda match: _IRREGULAR_NEGATIONS[match.group(1).lower()], text
    )
    text = _NEGATION_SUFFIX_RE.sub(' not', text)
    # (d) Keep only letters and spaces
    text = _NON_LETTER_RE.sub(' ', text)
    # (e) Lowercase
    text = text.lower()
    # (f) Tokenize
    tokens = word_tokenize(text)
    # (g) Drop one-letter tokens and stopwords, keeping the negative words
    sw = _english_stopwords()
    tokens = [
        tok for tok in tokens
        if len(tok) > 1 and (tok not in sw or tok in negative_words)
    ]
    return ' '.join(tokens)

# 8. Run every post through the cleaner and store the result in a new column
cleaned = df['TEXT'].apply(clean_text)
df['cleaned_text'] = cleaned

# 9. (Optional) Keep only rows whose cleaned text still has content
has_content = cleaned.str.strip().ne('')
df = df[has_content]

# 10. Preview the first few rows
preview_columns = ['SNO', 'SUBREDDIT_NAME', 'cleaned_text', 'UPVOTES', 'DOWNVOTES']
print(df[preview_columns].head())

# 11. Write the preprocessed posts to a new CSV, without the index column
df.to_csv('interstellar_posts_preprocessed.csv', index=False)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


   SNO SUBREDDIT_NAME                                       cleaned_text  \
0    1   interstellar  monthly interstellar showings megathread greet...   
1    2   interstellar  new rule no photos videos theatrical screening...   
2    3   interstellar      look docking scene brand cooper cooper buying   
3    4   interstellar  stop spin observe wormhole revert back zero gr...   
4    5   interstellar  come tars know deliberate reference brightened...   

   UPVOTES  DOWNVOTES  
0        5          0  
1      362          0  
2      238          0  
3      117          0  
4      122          0  
