## Prvi deo zadatka

In [13]:
from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer

# Word families that share a root, each paired with the stem expected for
# every member of the family. Each entry is a (list_of_words, expected_stem) tuple.
word_families_with_expected_stems = [
    (["Connect", "Connection", "Connects", "Connecting", "Connectivity", "Connectable"], "connect"),
    (["Move", "Moves", "Moved", "Moving", "Movement", "Movable", "Mover"], "move"),
    (["Run", "Runs", "Running", "Runner", "Ranneth", "Ran", "Rerun", "Overrun"], "run"),
    (["Build", "Builds", "Building", "Builder", "Built", "Rebuild", "Rebuilding"], "build"),
    (["Play", "Plays", "Played", "Playing", "Player", "Playful", "Replay"], "play"),
    (["Write", "Writes", "Writing", "Writer", "Written", "Rewrite"], "write"),
    (["Read", "Reads", "Reading", "Reader", "Readable", "Reread"], "read"),
    (["Speak", "Speaks", "Speaking", "Speaker", "Spoken", "Unspeakable"], "speak"),
    (["Learn", "Learns", "Learning", "Learner", "Learned", "Unlearn"], "learn"),
    (["Teach", "Teaches", "Teaching", "Teacher", "Taught"], "teach"),
    (["See", "Sees", "Seeing", "Seen", "Seer", "Foresee"], "see"),
    (["Walk", "Walks", "Walking", "Walker", "Walked"], "walk"),
    (["Eat", "Eats", "Eating", "Eater", "Ate", "Eaten", "Overeat"], "eat"),
    (["Drink", "Drinks", "Drinking", "Drinker", "Drank", "Drunk"], "drink"),
    (["Drive", "Drives", "Driving", "Driver", "Drove", "Driven"], "drive"),
    (["Grow", "Grows", "Growing", "Grown", "Grower"], "grow"),
    (["Buy", "Buys", "Buying", "Buyer", "Bought", "Rebuy"], "buy"),
    (["Sell", "Sells", "Selling", "Seller", "Sold"], "sell"),
    (["Think", "Thinks", "Thinking", "Thinker", "Thought"], "think"),
    (["Work", "Works", "Working", "Worker", "Worked"], "work")
]

# Persist the word families and their expected stems to a text file,
# one family per line in the form "word1,word2,...;stem".
# The encoding is explicit so the file does not depend on the platform default.
with open('word_families_with_stems.txt', 'w', encoding='utf-8') as f:
    for family, stem in word_families_with_expected_stems:
        f.write(",".join(family) + ";" + stem + "\n")

# Load the families and expected stems back from the file.
with open('word_families_with_stems.txt', 'r', encoding='utf-8') as f:
    # Iterate the file directly instead of readlines(), and skip blank lines so
    # an empty line cannot raise IndexError when the stem is read below.
    loaded_families_with_stems = [line.strip().split(";") for line in f if line.strip()]

# Parsing does not need the file handle, so it runs after the file is closed.
loaded_families = [item[0].split(",") for item in loaded_families_with_stems]
expected_stems = [item[1] for item in loaded_families_with_stems]

In [14]:
# Initialize the three NLTK stemmers that are compared below
porter = PorterStemmer()
lancaster = LancasterStemmer()
snowball = SnowballStemmer("english")

def evaluate_stemmer(stemmer, word_families, expected_stems):
    """Count the word families that a stemmer fails to reduce to the expected stem.

    A family passes only when every word in it stems to one and the same value
    and that value equals the expected stem at the same position.

    Args:
        stemmer: Object exposing a ``stem(word)`` method.
        word_families: Sequence of word lists.
        expected_stems: Expected stem for each family, indexed in parallel
            with ``word_families``.

    Returns:
        Tuple ``(errors, problematic_families)`` where ``errors`` is the number
        of failing families and ``problematic_families`` is a list of
        ``(family, stems)`` pairs for those families.
    """
    problematic_families = []
    for index, family in enumerate(word_families):
        stems = list(map(stemmer.stem, family))
        all_agree = len(set(stems)) == 1
        # The expected stem is only looked up when the family is consistent.
        if all_agree and stems[0] == expected_stems[index]:
            continue
        problematic_families.append((family, stems))
    return len(problematic_families), problematic_families

# Evaluate each stemmer against the families and expected stems loaded from the file
porter_errors, porter_problems = evaluate_stemmer(porter, loaded_families, expected_stems)
lancaster_errors, lancaster_problems = evaluate_stemmer(lancaster, loaded_families, expected_stems)
snowball_errors, snowball_problems = evaluate_stemmer(snowball, loaded_families, expected_stems)

In [15]:
def display_results(stemmer_name, errors, problems):
    """Print the error count of one stemmer followed by its failing families.

    Args:
        stemmer_name: Label printed in the header line.
        errors: Number of failing families, printed in the header line.
        problems: Iterable of ``(family, stems)`` pairs; each pair is printed
            as a "Word Family" line and a "Stemmed" line.
    """
    report_lines = [f"{stemmer_name} errors: {errors}:"]
    for words, stemmed in problems:
        report_lines.append(f"  Word Family: {words}")
        report_lines.append(f"  Stemmed: {stemmed}")
        # A lone newline entry yields the two blank lines after each family.
        report_lines.append("\n")
    report_lines.append("\n")
    print("\n".join(report_lines))

# Print the results for each stemmer
print("Stemmer Evaluation Results:")
display_results("Porter Stemmer", porter_errors, porter_problems)
display_results("Lancaster Stemmer", lancaster_errors, lancaster_problems)
display_results("Snowball Stemmer", snowball_errors, snowball_problems)

Stemmer Evaluation Results:
Porter Stemmer errors: 19:
  Word Family: ['Move', 'Moves', 'Moved', 'Moving', 'Movement', 'Movable', 'Mover']
  Stemmed: ['move', 'move', 'move', 'move', 'movement', 'movabl', 'mover']


  Word Family: ['Run', 'Runs', 'Running', 'Runner', 'Ranneth', 'Ran', 'Rerun', 'Overrun']
  Stemmed: ['run', 'run', 'run', 'runner', 'ranneth', 'ran', 'rerun', 'overrun']


  Word Family: ['Build', 'Builds', 'Building', 'Builder', 'Built', 'Rebuild', 'Rebuilding']
  Stemmed: ['build', 'build', 'build', 'builder', 'built', 'rebuild', 'rebuild']


  Word Family: ['Play', 'Plays', 'Played', 'Playing', 'Player', 'Playful', 'Replay']
  Stemmed: ['play', 'play', 'play', 'play', 'player', 'play', 'replay']


  Word Family: ['Write', 'Writes', 'Writing', 'Writer', 'Written', 'Rewrite']
  Stemmed: ['write', 'write', 'write', 'writer', 'written', 'rewrit']


  Word Family: ['Read', 'Reads', 'Reading', 'Reader', 'Readable', 'Reread']
  Stemmed: ['read', 'read', 'read', 'reader', 'read

In [16]:
# Pick the stemmer with the fewest errors.
# The candidates are (error_count, name) tuples, so min() compares the error
# counts first; on a tie it falls back to comparing the names lexicographically.
best_stemmer = min((porter_errors, "Porter Stemmer"), (lancaster_errors, "Lancaster Stemmer"), (snowball_errors, "Snowball Stemmer"))
print(f"Best stemmer: {best_stemmer[1]} with {best_stemmer[0]} errors")

Best stemmer: Lancaster Stemmer with 17 errors


## Drugi deo zadatka

In [None]:
import csv
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer  # Using with my best-performing stemmer
import pandas as pd

# Download the NLTK resources used below. Newer NLTK releases load the Punkt
# sentence tokenizer from 'punkt_tab' instead of 'punkt', so both are requested
# to keep this cell runnable across NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Stemmer and stop-word set used by preprocess_text.
# Lancaster is used because it had the fewest errors in the evaluation above.
stemmer = LancasterStemmer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Lowercase, tokenize, filter and stem a piece of text.

    The text is split into sentences, each sentence is lowercased and split
    into word tokens, tokens that are not purely alphanumeric or that appear
    in the module-level ``stop_words`` set are dropped, and the remaining
    tokens are stemmed with the module-level ``stemmer``.

    Args:
        text: The raw text to process.

    Returns:
        The stemmed tokens joined by single spaces. Sentences that contribute
        no tokens add nothing to the result, so the output never contains
        doubled, leading or trailing spaces. Returns an empty string when no
        token survives the filters.
    """
    processed_words = []
    for sentence in sent_tokenize(text):
        for word in word_tokenize(sentence.lower()):
            # str.isalnum() rejects punctuation and any token that contains it.
            if word.isalnum() and word not in stop_words:
                processed_words.append(stemmer.stem(word))

    # All tokens go into one flat list: joining per sentence first would emit
    # an empty string for every sentence that was filtered away completely.
    return ' '.join(processed_words)

# Read the input CSV, preprocess the 'email' column of every row and write the
# result to a new CSV with the same columns.
input_file = 'email_classification.csv'
output_file = 'processed_email_classification.csv'

# newline='' leaves line-ending handling to the csv module for both files.
with open(input_file, 'r', newline='', encoding='utf-8') as infile, open(output_file, 'w', newline='', encoding='utf-8') as outfile:
    
    reader = csv.DictReader(infile)
    # Reuse the input header so the output file has the same columns in the same order.
    fieldnames = reader.fieldnames
    
    writer = csv.DictWriter(outfile, fieldnames=fieldnames)
    writer.writeheader()
    
    for row in reader:
        # Only the 'email' field is replaced; every other column is written back unchanged.
        processed_email = preprocess_text(row['email'])
        row['email'] = processed_email
        writer.writerow(row)

print(f"Processed data has been saved to {output_file}")

In [20]:
# Load the raw dataset and display it for comparison with the processed version.
# NOTE(review): the rendered output shows an 'Unnamed: 0' column, which suggests
# the CSV carries a saved index column - confirm whether index_col=0 is wanted.
df_input_file = pd.read_csv('email_classification.csv')
df_input_file

Unnamed: 0,email,label
0,Upgrade to our premium plan for exclusive acce...,ham
1,Happy holidays from our team! Wishing you joy ...,ham
2,We're hiring! Check out our career opportuniti...,ham
3,Your Amazon account has been locked. Click her...,spam
4,Your opinion matters! Take our survey and help...,ham
...,...,...
174,We're pleased to inform you that your refund h...,ham
175,Get rich quick! Invest in our revolutionary ne...,spam
176,Your free trial period is ending soon. Upgrade...,ham
177,Your order is on its way! Track your shipment ...,ham


In [21]:
# Load the processed dataset written by the preprocessing cell and display it.
df_output_file = pd.read_csv('processed_email_classification.csv')
df_output_file

Unnamed: 0,email,label
0,upgrad prem plan exclud access prem cont feat,ham
1,happy holiday team wish joy prosp season,ham
2,hir check car opportun join dynam team,ham
3,amazon account lock click ver account inform,spam
4,opin mat tak survey help us enh expery,ham
...,...,...
174,pleas inform refund process success,ham
175,get rich quick invest revolv new scheme retir ear,spam
176,fre tri period end soon upgrad continu enjoy serv,ham
177,ord way track ship upd,ham
