## Data Collection


In [155]:
import requests
from bs4 import BeautifulSoup
import praw
import pandas as pd
import re
from time import sleep


In [159]:
# Forum Scraper (Greeklish)
def scrape_forum():

    """Scrapes forum posts from insomnia.gr.

    Returns:
        A list of forum post texts.
    """

    url = 'https://www.insomnia.gr/forums/'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    posts = soup.find_all('p', class_='ipsHide')  # Adjust class based on site
    return [post.text for post in posts[:100]]

# YouTube Comments (Greeklish)
def scrape_youtube():
    """Scrapes comments from a YouTube video.

    Returns:
        A list of comment texts.
    """
    url = 'https://www.youtube.com/watch?v=_akH1Bns2B8'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    comments = soup.find_all('yt-formatted-string', class_='style-scope ytd-comment-renderer')
    return [comment.text for comment in comments[:100]]

# Reddit Scraper

def is_valid_greeklish(text):

    """Checks if a text is likely to be Greeklish.

    Args:
        text: The text to check.

    Returns:
        True if the text is likely to be Greeklish, False otherwise.
    """

    if len(text) < 3 or text in ["[deleted]", "[removed]"]:
        return False

    # Reject Greek/Cyrillic scripts
    if re.search(r'[α-ωά-ώΑ-Ω]', text) or re.search(r'[а-яА-Я]', text):
        return False

    # Reject excessive numbers/leet (e.g., "k4n315")
    if sum(c.isdigit() for c in text) > len(text) * 0.2:
        return False

    # Key Greeklish words (customize as needed)
    greeklish_words = {
        "kaneis", "einai", "thelw", "gia", "auto", "kai", "den", "ti", "mou",
        "sou", "tora", "minymata", "ellinika", "greeklish", "vlepw", "kserw"
    }
    words = set(re.findall(r'\b\w+\b', text))
    return len(words & greeklish_words) >= 2  # At least 2 Greeklish words


def scrape_reddit():

    """Scrapes Reddit posts from the r/greece subreddit.

    Returns:
        A list of Reddit post titles.
    """

    posts = []
    reddit = praw.Reddit(client_id='XYqJYrsAQ1NjcH2po55CIA', client_secret='WJS-uobHelasoF3khZb4KLPnVq05Yg', user_agent='GreeklishScraper/0.0.1', check_for_async=False)
    subreddit = reddit.subreddit("greece")
    for submission in subreddit.search("greeklish", limit=400):
      if is_valid_greeklish(submission.title):
        posts.append({"text": submission.title, "label": "greeklish"})

      if submission.comments:
           submission.comments.replace_more(limit=0)
           for comment in submission.comments.list():
             if is_valid_greeklish(comment.body):
                 posts.append({"text": comment.body, "label": "greeklish"})
    return [submission.title for submission in subreddit.search("greeklish", limit=400)]

# (English)


def clean_text(text):

    """Cleans and preprocesses text data.

    Args:
        text: The text to clean.

    Returns:
        The cleaned text.
    """

    # Remove extra whitespace and special characters
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def scrape_reddit_sentences(subreddit_name="AskReddit", min_sentences=150):
    """Scrapes sentences from Reddit posts and comments.

    Args:
        subreddit_name: The name of the subreddit to scrape.
        min_sentences: The minimum number of sentences to scrape.

    Returns:
        A list of sentences.
    """
    # Initialize Reddit API
    reddit = praw.Reddit(
        client_id="XYqJYrsAQ1NjcH2po55CIA",
        client_secret="WJS-uobHelasoF3khZb4KLPnVq05Yg",
        user_agent="sentence_scraper_v1"
    )

    all_sentences = []
    subreddit = reddit.subreddit(subreddit_name)

    # Get top posts and comments
    for submission in subreddit.top(limit=50):
        submission.comments.replace_more(limit=0)
        # Add submission title
        title_sentences = nltk.sent_tokenize(clean_text(submission.title))
        all_sentences.extend(title_sentences)

        # Add comments
        for comment in submission.comments.list():
            if hasattr(comment, 'body'):
                comment_sentences = nltk.sent_tokenize(clean_text(comment.body))
                all_sentences.extend(comment_sentences)

        if len(all_sentences) >= min_sentences:
            break

    return all_sentences[:min_sentences]

def scrape_wikipedia_sentences(url, min_sentences=150):

    """Scrapes sentences from Wikipedia articles.

    Args:
        url: The URL of the Wikipedia article.
        min_sentences: The minimum number of sentences to scrape.

    Returns:
        A list of sentences.
    """
    # Set headers to mimic browser
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find content paragraphs (Wikipedia uses 'mw-paragraph' in content)
    paragraphs = soup.find_all('p')

    all_sentences = []
    for para in paragraphs:
        text = para.get_text()
        # Remove citation numbers like [1], [2] etc.
        text = re.sub(r'\[\d+\]', '', text)
        sentences = nltk.sent_tokenize(clean_text(text))
        all_sentences.extend(sentences)

    return all_sentences[:min_sentences] if len(all_sentences) > min_sentences else all_sentences



# Combine data
def collect_data():

    """Collects Greeklish and English data.

    Returns:
        A tuple containing two lists: Greeklish data and English data.
    """

    greeklish = scrape_youtube() + scrape_forum() + scrape_reddit()
    english = scrape_wikipedia_sentences("https://en.wikipedia.org/wiki/Artificial_intelligence",200) + scrape_reddit_sentences("AskReddit",200)
    return greeklish, english


In [160]:
# Save to CSV
greeklish_data, english_data = collect_data()
# Check the length of the data and adjust labels accordingly
greeklish_len = len(greeklish_data)
english_len = len(english_data)

df = pd.DataFrame({
    'text': greeklish_data + english_data,
    'label': ['Greeklish'] * greeklish_len + ['English'] * english_len
})
df.shape

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



(593, 2)

In [161]:
df.to_csv('initial_dataset.csv', index=False)

## Data Preparation

In [162]:
import pandas as pd

# Read the CSV file
df = pd.read_csv('/content/initial_dataset.csv')


def split_into_sentences(paragraph):
    # Simple sentence splitting based on periods, exclamation, and question marks
    sentences = [s.strip() for s in paragraph.replace('!', '.').replace('?', '.').split('.') if s.strip()]
    return sentences

# Create a new DataFrame with one sentence per row
new_rows = []
for index, row in df.iterrows():
    sentences = split_into_sentences(row['text'])
    for sentence in sentences:
        new_row = row.copy()
        new_row['text'] = sentence
        new_rows.append(new_row)

# Create new DataFrame
new_df = pd.DataFrame(new_rows)

# Reset index if needed
new_df = new_df.reset_index(drop=True)

# Save to new CSV file
new_df.to_csv('output.csv', index=False)

print(f"Original DataFrame size: {len(df)} rows")
print(f"New DataFrame size: {len(new_df)} rows")

Original DataFrame size: 593 rows
New DataFrame size: 674 rows


In [111]:
# Import libraries
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from google.colab import files

In [113]:
# Download NLTK resources
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [180]:
df = pd.read_csv('/content/final_dataset.csv')

In [181]:
# Verify the data
print("Initial data preview:")
print(df.head())
print(f"Total sentences: {len(df)}")

Initial data preview:
                                                                                                  text  \
0  den eixa ellinika sto kinito mou gia xronia kai akoma kai twra pou exw stelnw minymata se greekl...   
1                8ymamai na einai 2003 kai na vlepw greeklish na xrisimopoiountai akomi kai sta forums   
2                                                                         eixan ki afta tin plaka tous   
3                                                                  episis, 8ym4741 k4n315 70 13375p34k   
4                                                                                                   :D   

       label  
0  greeklish  
1  greeklish  
2  greeklish  
3  greeklish  
4  greeklish  
Total sentences: 904


In [182]:
# Cleaning and preprocessing function
def preprocess_text(text):

    """Cleans and preprocesses a sentence.

    Args:
        text: The sentence to preprocess.

    Returns:
        The preprocessed sentence.
    """

    # Convert to lowercase
    text = text.lower()
    # Remove special characters, numbers, and extra whitespace
    text = re.sub(r'[^a-z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords (English stopwords; minimal impact on Greeklish)
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    # Join tokens back into a sentence
    return ' '.join(tokens)

In [183]:
# Apply preprocessing
df['cleaned_sentence'] = df['text'].apply(preprocess_text)

In [184]:
# Drop empty rows after cleaning
df = df[df['cleaned_sentence'] != ''].reset_index(drop=True)

In [185]:
# Prepare final DataFrame (rename 'text' to 'sentence' as per convention)
df_final = df[['cleaned_sentence', 'label']].rename(columns={'cleaned_sentence': 'sentence'})

In [186]:
# Verify class distribution
print("\nClass distribution:")
print(df_final['label'].value_counts())


Class distribution:
label
greeklish    497
English      388
Name: count, dtype: int64


In [187]:
# Save preprocessed data
df_final.to_csv('preprocessed_sentences.csv', index=False)
print("\nPreprocessed data saved as 'preprocessed_sentences.csv' and downloaded.")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Preprocessed data saved as 'preprocessed_sentences.csv' and downloaded.


## Model Development

In [188]:
# Split data into features (X) and labels (y)
X = df_final['sentence']
y = df_final['label']

In [189]:
# Train-test split (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [190]:
# Vectorize text using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)  # Limit features for efficiency
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [191]:
# Train Logistic Regression classifier
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train_tfidf, y_train)

In [192]:
# Predict on test set
y_pred = model.predict(X_test_tfidf)

In [193]:
# Evaluate model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

In [194]:
# Display results
print("\nModel Performance:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")


Model Performance:
Accuracy: 0.9492
Precision: 0.9544
Recall: 0.9492
F1-Score: 0.9493


In [149]:
# Save the model and vectorizer using joblib to the 'model' directory
import joblib
import os

# Create a directory to store the model files
os.makedirs('model', exist_ok=True)

joblib.dump(model, 'model/greeklish_classifier.pkl')
joblib.dump(vectorizer, 'model/tfidf_vectorizer.pkl')
print("\nModel and vectorizer saved to the 'model' directory.")

# Download model files
!zip -r model.zip model  # Zip the 'model' directory
from google.colab import files
files.download('model.zip')
print("Model files zipped and downloaded as 'model.zip'.")


Model and vectorizer saved to the 'model' directory.
  adding: model/ (stored 0%)
  adding: model/greeklish_classifier.pkl (deflated 42%)
  adding: model/tfidf_vectorizer.pkl (deflated 75%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Model files zipped and downloaded as 'model.zip'.


In [154]:
# Load model
model = joblib.load("greeklish_classifier.pkl")

# Function to classify a sentence
def predict_text(text):

    """Classifies a sentence as Greeklish or English.

    Args:
        text: The sentence to classify.

    Returns:
        The predicted label ('Greeklish' or 'English').
    """

    processed_text = preprocess_text(text)
    # Reshape the processed_text to a 2D array with one column
    # The following line is changed
    prediction = model.predict(vectorizer.transform([processed_text]))[0]
    return prediction

# Test examples
print(predict_text("ti kaneis"))  # Expected output: Greeklish
print(predict_text("Hello, how are you?"))  # Expected output: English

greeklish
English
