# **Création d'une base de données SQL**
![Diagramme de la base de données](diagram_db.png)

In [11]:
import sqlite3
import pandas as pd

# Create the SQL database
def create_db() -> int:
    """Build ``database.db`` from the two training CSV files.

    Reads ``summaries_train.csv`` and ``prompts_train.csv`` from the
    ``commonlit-evaluate-student-summaries`` directory (relative to the
    current working directory) and writes them to the ``summaries`` and
    ``prompt`` tables, replacing any table that already has that name.

    Returns:
        0 on success; 1 if any step raised. The error is printed, not
        re-raised.
    """
    conn = None
    try:
        summaries = pd.read_csv('commonlit-evaluate-student-summaries/summaries_train.csv')
        prompt = pd.read_csv('commonlit-evaluate-student-summaries/prompts_train.csv')
        conn = sqlite3.connect('database.db')
        summaries.to_sql('summaries', conn, if_exists='replace', index=False)
        prompt.to_sql('prompt', conn, if_exists='replace', index=False)
        print("Database successfuly created")
    except Exception as e:
        print(f"An error occured: {e}")
        return 1
    finally:
        # Close the connection on the error path too, so a failed write does
        # not leave an open handle on database.db.
        if conn is not None:
            conn.close()
    return 0

# Read the content of a table from a request
def read_table(request: str, params: tuple = ()) -> list:
    """Run a SQL statement against ``database.db`` and return all result rows.

    Args:
        request: SQL text to execute. It may contain ``?`` placeholders.
        params: Values bound to the placeholders in ``request``. Defaults to
            no parameters, which matches calling with the SQL text alone.

    Returns:
        A list of row tuples, as produced by ``fetchall()``.

    Raises:
        sqlite3.Error: if the statement cannot be executed.
    """
    conn = sqlite3.connect('database.db')
    try:
        return conn.execute(request, params).fetchall()
    finally:
        # Always release the connection, even when the query fails.
        conn.close()

# Insert rows into the summaries table
def add_to_summaries(data : dict | pd.DataFrame) -> int:
    try:
        data = list(pd.DataFrame(data).itertuples(index=False, name=None))
        conn = sqlite3.connect('database.db')
        c = conn.cursor()
        c.executemany("INSERT INTO summaries VALUES (?, ?, ?, ?, ?)", data)
        conn.commit()
        conn.close()
        print("Data imported to database")
    except Exception as e:
        print(f"An error occured: {e}")
        return 1
    return 0

# Insert rows into the prompt table
def add_to_prompt(data: dict | pd.DataFrame) -> int:
    try:
        data = list(pd.DataFrame(data).itertuples(index=False, name=None))
        conn = sqlite3.connect('database.db')
        c = conn.cursor()
        c.executemany("INSERT INTO prompts VALUES (?, ?, ?, ?)", data)
        conn.commit()
        conn.close()
        print("Data imported to database")
    except Exception as e:
        print(f"An error occured: {e}")
        return 1
    return 0

In [12]:
# Build database.db from the CSV files; the cell's value is the status code (0 on success).
create_db()

Database successfuly created


0

In [4]:
# Fetch every row of the prompt table.
request = "SELECT * FROM prompt"
read_table(request)

[('39c16e',
  'Summarize at least 3 elements of an ideal tragedy, as described by Aristotle.',
  'On Tragedy',
  'Chapter 13 \r\nAs the sequel to what has already been said, we must proceed to consider what the poet should aim at, and what he should avoid, in constructing his plots; and by what means the specific effect of Tragedy will be produced. \r\nA perfect tragedy should, as we have seen, be arranged not on the simple but on the complex plan. It should, moreover, imitate actions which excite pity and fear, this being the distinctive mark of tragic imitation. It follows plainly, in the first place, that the change of fortune presented must not be the spectacle of a virtuous man brought from prosperity to adversity: for this moves neither pity nor fear; it merely shocks us. Nor, again, that of a bad man passing from adversity to prosperity: for nothing can be more alien to the spirit of Tragedy; it possesses no single tragic quality; it neither satisfies the moral sense nor calls f

In [5]:
# Fetch two rows of the summaries table.
request = "SELECT * FROM summaries LIMIT 2"
read_table(request)

[('000e8c3c7ddb',
  '814d6b',
  'The third wave was an experimentto see how people reacted to a new one leader government. It gained popularity as people wanted to try new things. The students follow anything that is said and start turning on eachother to gain higher power. They had to stop the experement as too many people got to radical with it blindly following there leader',
  0.205682506482641,
  0.380537638762288),
 ('0020ae56ffbf',
  'ebad26',
  'They would rub it up with soda to make the smell go away and it wouldnt be a bad smell. Some of the meat would be tossed on the floor where there was sawdust spit of the workers and they would make the meat all over again with the things in it.',
  -0.548304076980462,
  0.506755353548534)]

In [13]:
# Two hand-written rows to insert. add_to_summaries() binds values by position,
# so the key order here must match the column order of the summaries table.
data = {
    'student_id' : ['bad_student', 'good_student'],
    'prompt_id' : ['39c16e', '39c16e'],
    'text' : ['This is not a very good summary', 'This is a very good summary'],
    'content' : [0, 3],
    'wording' : [0, 3]
    }

add_to_summaries(data)

Data imported to database


0

In [16]:
# Select the rows whose student_id ends with "student" to check the insert above.
request = "SELECT * FROM summaries WHERE student_id LIKE '%student'"
read_table(request)

[('bad_student', '39c16e', 'This is not a very good summary', 0.0, 0.0),
 ('good_student', '39c16e', 'This is a very good summary', 3.0, 3.0)]

# **Mise en place de tests unitaires**

In [1]:
import pytest
import ipytest

from feature_engineering import *

ipytest.autoconfig()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [29]:
%%ipytest

def test_relative_length():
    """relative_length gives 1.0 for each listed (prompt_id, length) pair and
    raises IndexError for a prompt_id that is not in the frame."""
    prompts = pd.DataFrame({
        'prompt_id': ['1', '2', '3'],
        'prompt': ['abc', 'defg', 'h'],
    })
    matching_lengths = [('1', 3), ('2', 4), ('3', 1)]
    for prompt_id, length in matching_lengths:
        assert relative_length(prompts, prompt_id, length) == 1.0
    with pytest.raises(IndexError):
        relative_length(prompts, '4', 1)

def test_count_stopwords():
    """count_stopwords returns the expected count per sentence and raises
    AttributeError on a non-string argument."""
    expected_counts = [
        ("This is a test.", 3),
        ("No stop words here.", 1),
        ("", 0),
        ("The quick brown fox jumps over the lazy dog.", 3),
    ]
    for sentence, expected in expected_counts:
        assert count_stopwords(sentence) == expected
    with pytest.raises(AttributeError):
        count_stopwords(123)

def test_count_punctuation():
    """count_punctuation counts punctuation characters and raises TypeError
    on a non-string argument."""
    assert count_punctuation("Hello, world!") == 2
    assert count_punctuation("No punctuation here") == 0
    assert count_punctuation("") == 0
    # All 32 ASCII punctuation characters. The backslash is written as "\\"
    # because "\]" is an invalid escape sequence; the resulting string is the same.
    assert count_punctuation("!#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"+'"') == 32
    with pytest.raises(TypeError):
        count_punctuation(123)

def test_count_numbers():
    """count_numbers returns the expected count per text and raises TypeError
    on a non-string argument."""
    expected_counts = [
        ("123 abc 456", 2),
        ("No numbers here", 0),
        ("", 0),
        ("1234567890", 1),
    ]
    for text, expected in expected_counts:
        assert count_numbers(text) == expected
    with pytest.raises(TypeError):
        count_numbers(123)

def test_lemmatize_text():
    """lemmatize_text returns the expected lemma list per sentence and raises
    AttributeError on a non-string argument."""
    expected_lemmas = [
        ("He is running.", ['run']),
        ("No stop words here.", ['stop', 'word']),
        ("", []),
        ("The quick brown fox jumps over the lazy dog.",
         ['quick', 'brown', 'fox', 'jump', 'lazy', 'dog']),
    ]
    for sentence, lemmas in expected_lemmas:
        assert lemmatize_text(sentence) == lemmas
    with pytest.raises(AttributeError):
        lemmatize_text(123)

def test_lemmatize():
    """lemmatize adds a ``<column>_lemmatized`` column for each requested
    column, raises KeyError for an unknown column and AttributeError for a
    non-string column."""
    frame = pd.DataFrame({
        'text1': ['He is running.', 'No stop words here.', '', 'The quick brown fox jumps over the lazy dog.'],
        'text2': ['Another text.', 'More words here.', 'Empty string.', 'A sentence with punctuation!']
    })
    expected_text1 = [['run'], ['stop', 'word'], [], ['quick', 'brown', 'fox', 'jump', 'lazy', 'dog']]
    expected_text2 = [['another', 'text'], ['word'], ['empty', 'string'], ['sentence', 'punctuation']]

    lemmatized = lemmatize(frame, ['text1', 'text2'])
    assert lemmatized['text1_lemmatized'].tolist() == expected_text1
    assert lemmatized['text2_lemmatized'].tolist() == expected_text2

    with pytest.raises(KeyError):
        lemmatize(frame, 'non_existing_column')

    # Column of integers, added only for the error case below.
    frame['non_string_column'] = [1, 2, 3, 4]
    with pytest.raises(AttributeError):
        lemmatize(frame, 'non_string_column')


def test_count_unique_words():
    """count_unique_words returns the number of distinct tokens and raises
    TypeError when given a string instead of a list."""
    expected_counts = [
        (['run', 'run', 'jump'], 2),
        (['word'], 1),
        ([], 0),
        (['quick', 'brown', 'fox', 'jump', 'lazy', 'dog'], 6),
    ]
    for tokens, expected in expected_counts:
        assert count_unique_words(tokens) == expected
    with pytest.raises(TypeError):
        count_unique_words("not a list")

def test_vectorizer():
    """vectorizer returns a dict keyed by prompt_id whose entries have the
    keys 'prompt' and 'prompt_question'; a non-DataFrame raises AttributeError."""
    prompts = pd.DataFrame({
        'prompt_id': ['1', '2'],
        'prompt_lemmatized': [['run', 'run', 'jump'], ['word']],
        'prompt_question_lemmatized': [['quick', 'brown', 'fox', 'jump', 'lazy', 'dog'], ['another', 'word']]
    })
    by_prompt = vectorizer(prompts)
    assert set(by_prompt.keys()) == {'1', '2'}
    assert set(by_prompt['1'].keys()) == {'prompt', 'prompt_question'}
    with pytest.raises(AttributeError):
        vectorizer("not a DataFrame")

def test_vectorize():
    """vectorize turns a token list into a count row using a fitted
    CountVectorizer, and raises on a non-list input or a non-vectorizer."""
    count_vec = CountVectorizer()
    count_vec.fit(['run jump jump', 'word'])
    expected_rows = [
        (['run', 'run', 'jump'], [[1, 2, 0]]),
        (['word'], [[0, 0, 1]]),
        ([], [[0, 0, 0]]),
    ]
    for tokens, row in expected_rows:
        assert np.array_equal(vectorize(tokens, count_vec), np.array(row))
    with pytest.raises(TypeError):
        vectorize("not a list", count_vec)
    with pytest.raises(AttributeError):
        vectorize(['run', 'run', 'jump'], "not a CountVectorizer")

def test_jaccard_similarity():
    """jaccard_similarity returns the expected score for each vector pair and
    raises ValueError on a non-array or on mismatched widths."""
    expected_scores = [
        ([[1, 1, 0]], [[1, 0, 0]], 0.5),
        ([[1, 1, 1]], [[1, 1, 1]], 1.0),
        ([[0, 0, 0]], [[1, 1, 1]], 0.0),
        ([[1, 0, 1]], [[0, 1, 0]], 0.0),
        ([[1, 2, 3]], [[1, 2, 3]], 1.0),
    ]
    for left, right, score in expected_scores:
        assert jaccard_similarity(np.array(left), np.array(right)) == score
    with pytest.raises(ValueError):
        jaccard_similarity("not a ndarray", np.array([[1, 1, 0]]))
    with pytest.raises(ValueError):
        jaccard_similarity(np.array([[1, 1]]), np.array([[1, 1, 0]]))

def test_ner():
    """ner returns the expected (text, label) pairs per sentence and raises
    ValueError on a non-string argument."""
    expected_entities = [
        ("Apple is looking at buying U.K. startup for $1 billion",
         [('Apple', 'ORG'), ('U.K.', 'GPE'), ('$1 billion', 'MONEY')]),
        ("He was born on 2000-01-01.", [('2000-01-01', 'DATE')]),
        ("", []),
    ]
    for sentence, entities in expected_entities:
        assert ner(sentence) == entities
    with pytest.raises(ValueError):
        ner(123)

def test_jaccard_similarity_ner():
    """jaccard_similarity_ner returns the expected score for each pair of
    entity lists and raises TypeError when the first argument is a string."""
    apple_uk = [('Apple', 'ORG'), ('U.K.', 'GPE')]
    birth_date = [('2000-01-01', 'DATE')]
    expected_scores = [
        ([('Apple', 'ORG'), ('U.K.', 'GPE'), ('$1 billion', 'MONEY')], apple_uk, 2/3),
        (birth_date, [('2000-01-01', 'DATE')], 1.0),
        ([], birth_date, 0.0),
        (apple_uk, [('Google', 'ORG'), ('U.S.', 'GPE')], 0.0),
    ]
    for left, right, score in expected_scores:
        assert jaccard_similarity_ner(left, right) == score
    with pytest.raises(TypeError):
        jaccard_similarity_ner("not a list", [('Apple', 'ORG'), ('U.K.', 'GPE')])

def test_readability():
    """readability returns a Series equal to the expected three values per
    text and raises TypeError on a non-string argument."""
    expected_scores = [
        ("The cat sat on the mat.", [116.15, 2.4, -4.4]),
        ("This is a more complex sentence, with more words and more syllables.", [84.68, 8.13, 6.9]),
        ("", [206.84, 0.0, 0.0]),
    ]
    for text, scores in expected_scores:
        assert readability(text).equals(pd.Series(scores))
    with pytest.raises(TypeError):
        readability(123)

def test_cosine_similarity_sentiment():
    """cosine_similarity_sentiment returns the expected value for each pair
    of 2-tuples and raises ValueError on a string or on mismatched lengths."""
    expected_scores = [
        ((1, 0), (1, 0), 1.0),
        ((1, 0), (0, 1), 0.0),
        ((1, 0), (-1, 0), -1.0),
        ((0, 0), (1, 0), 0.0),
    ]
    for left, right, score in expected_scores:
        assert cosine_similarity_sentiment(left, right) == score
    with pytest.raises(ValueError):
        cosine_similarity_sentiment("not a tuple", (1, 0))
    with pytest.raises(ValueError):
        cosine_similarity_sentiment((1, 0), (1, 0, 0))

def test_sentiment():
    """sentiment returns 1.0 when the text matches the prompt's blob, a fixed
    lower value otherwise, and raises on an unknown prompt_id or non-string text."""
    prompts = pd.DataFrame({
        'prompt_id': ['1', '2'],
        'blob': [TextBlob('I love this product.'), TextBlob('I hate this product.')]
    })
    expected_scores = [
        ('I love this product.', '1', 1.0),
        ('I hate this product.', '2', 1.0),
        ('I love this product.', '2', 0.1488603778620425),
        ('I hate this product.', '1', 0.1488603778620425),
    ]
    for text, prompt_id, score in expected_scores:
        assert sentiment(prompts, text, prompt_id) == score
    with pytest.raises(IndexError):
        sentiment(prompts, 'I love this product.', '3')
    with pytest.raises(TypeError):
        sentiment(prompts, 123, '1')

def test_tokenize():
    """tokenize returns the expected space-separated tag string per sentence
    and raises TypeError on a non-string argument."""
    expected_tags = [
        ("The quick brown fox jumps over the lazy dog.", 'DT JJ NN NN VBZ IN DT JJ NN .'),
        ("No punctuation here", 'DT NN RB'),
        ("", ''),
    ]
    for sentence, tags in expected_tags:
        assert tokenize(sentence) == tags
    with pytest.raises(TypeError):
        tokenize(123)

def test_tfidf_vectorizer():
    """tfidf_vectorizer returns a TfidfVectorizer with ngram_range (4, 4) and
    raises TypeError when either argument is not a DataFrame."""
    tag_strings = ['DT JJ NN NN VBZ IN DT JJ NN .', 'DT NN RB', '']
    # Two independent frames with the same content, one per argument.
    prompt_frame = pd.DataFrame({'tokens': list(tag_strings)})
    summaries_frame = pd.DataFrame({'tokens': list(tag_strings)})

    fitted = tfidf_vectorizer(prompt_frame, summaries_frame)
    assert isinstance(fitted, TfidfVectorizer)
    assert fitted.ngram_range == (4, 4)

    with pytest.raises(TypeError):
        tfidf_vectorizer("not a DataFrame", summaries_frame)
    with pytest.raises(TypeError):
        tfidf_vectorizer(prompt_frame, "not a DataFrame")


[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m                                                                            [100%][0m
[32m[32m[1m17 passed[0m[32m in 0.11s[0m[0m
