In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score


def read_text(filename, encoding):
    """Return the entire contents of a text file as one string.

    Args:
        filename: Path of the file to open for reading.
        encoding: Name of the codec used to decode the file's bytes.

    Returns:
        The decoded file contents.
    """
    with open(filename, mode='r', encoding=encoding) as source:
        contents = source.read()
    return contents


# One sample document per language: (file to read, codec to decode it with, label).
_SAMPLES = (
    ('Anglais_10.txt', 'utf-8', 'English'),
    ('Français_1.txt', 'windows-1252', 'French'),
    ('Chinois_1.txt', 'utf-8', 'Chinese'),
)

# Column-oriented view of the samples; the files are read in the order listed above.
data = {
    'Text': [read_text(path, codec) for path, codec, _ in _SAMPLES],
    'Language': [label for _, _, label in _SAMPLES],
}

# Three-row frame: one full document and its language label per row.
df = pd.DataFrame(data)


# Build a small sentence-level frame from the English document.
# NOTE: the Native/Non-Native labels below are hard-coded placeholders assigned
# by position; they are not derived from the sentences themselves.
_english_document = df[df['Language'] == 'English']['Text'].iloc[0]

# Simulated split into sentences: cut on '.', then trim whitespace and discard
# blank fragments (produced by a trailing '.' or by '...') so that no empty
# "sentence" is labelled. At most the first 10 sentences are kept.
_sentences = [
    fragment.strip()
    for fragment in _english_document.split('.')
    if fragment.strip()
][:10]

_placeholder_labels = [
    'Native', 'Non-Native', 'Native', 'Non-Native', 'Native',
    'Native', 'Non-Native', 'Native', 'Non-Native', 'Native',
]

english_texts = {
    'Text': _sentences,
    # Keep both columns the same length: a document with fewer than 10
    # sentences would otherwise make the DataFrame constructor raise.
    'Is_Native': _placeholder_labels[:len(_sentences)],
}
df_english = pd.DataFrame(english_texts)


# Language identification: character 1- to 3-gram counts of each document,
# classified with multinomial naive Bayes.
y_lang = df['Language']
vectorizer_lang = CountVectorizer(analyzer='char', ngram_range=(1, 3))
X_lang = vectorizer_lang.fit_transform(df['Text'])

# fit() returns the fitted estimator, so construction and training are chained.
model_lang = MultinomialNB().fit(X_lang, y_lang)


# Native / Non-Native classification of the English sentences: TF-IDF weights
# over 1- to 3-grams, classified with multinomial naive Bayes.
y_eng = df_english['Is_Native']
vectorizer_eng = TfidfVectorizer(ngram_range=(1, 3))
X_eng = vectorizer_eng.fit_transform(df_english['Text'])

# fit() returns the fitted estimator, so construction and training are chained.
model_eng = MultinomialNB().fit(X_eng, y_eng)


# NOTE: each model is scored on the same matrix it was fitted on (X_lang /
# X_eng), so the figures printed here are training-set scores, not an estimate
# of performance on unseen text.

# --- Language detection ---
y_pred_lang = model_lang.predict(X_lang)
lang_accuracy = accuracy_score(y_lang, y_pred_lang)
print("Language Detection Accuracy:", lang_accuracy)
lang_report = classification_report(y_lang, y_pred_lang)
print("Language Classification Report:\n", lang_report)

# --- English Native / Non-Native detection ---
y_pred_eng = model_eng.predict(X_eng)
eng_accuracy = accuracy_score(y_eng, y_pred_eng)
print("English Native/Non-Native Detection Accuracy:", eng_accuracy)
eng_report = classification_report(y_eng, y_pred_eng)
print("English Text Classification Report:\n", eng_report)


Language Detection Accuracy: 1.0
Language Classification Report:
               precision    recall  f1-score   support

     Chinese       1.00      1.00      1.00         1
     English       1.00      1.00      1.00         1
      French       1.00      1.00      1.00         1

    accuracy                           1.00         3
   macro avg       1.00      1.00      1.00         3
weighted avg       1.00      1.00      1.00         3

English Native/Non-Native Detection Accuracy: 1.0
English Text Classification Report:
               precision    recall  f1-score   support

      Native       1.00      1.00      1.00         6
  Non-Native       1.00      1.00      1.00         4

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10

